From e15af206c133896130d2ca43274e90a1dc7cdd8f Mon Sep 17 00:00:00 2001
From: Gautam Botrel
Date: Mon, 7 Nov 2022 16:12:17 -0600
Subject: [PATCH 01/43] feat: ported msm-affine

---
 ecc/bls12-377/g1.go | 74 +
 ecc/bls12-377/g2.go | 74 +
 ecc/bls12-377/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls12-377/multiexp_test.go | 84 +-
 ecc/bls12-378/g1.go | 74 +
 ecc/bls12-378/g2.go | 74 +
 ecc/bls12-378/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls12-378/multiexp_test.go | 84 +-
 ecc/bls12-381/g1.go | 74 +
 ecc/bls12-381/g2.go | 74 +
 ecc/bls12-381/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls12-381/multiexp_test.go | 84 +-
 ecc/bls24-315/g1.go | 74 +
 ecc/bls24-315/g2.go | 74 +
 ecc/bls24-315/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls24-315/multiexp_test.go | 84 +-
 ecc/bls24-317/g1.go | 74 +
 ecc/bls24-317/g2.go | 74 +
 ecc/bls24-317/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bls24-317/multiexp_test.go | 84 +-
 ecc/bn254/g1.go | 74 +
 ecc/bn254/g2.go | 74 +
 ecc/bn254/multiexp_affine.go | 1883 +++++++++++++++++
 ecc/bn254/multiexp_test.go | 84 +-
 ecc/bw6-633/g1.go | 74 +
 ecc/bw6-633/g2.go | 74 +
 ecc/bw6-633/multiexp_affine.go | 857 ++++++++
 ecc/bw6-633/multiexp_test.go | 84 +-
 ecc/bw6-756/g1.go | 74 +
 ecc/bw6-756/g2.go | 74 +
 ecc/bw6-756/multiexp_affine.go | 857 ++++++++
 ecc/bw6-756/multiexp_test.go | 84 +-
 ecc/bw6-761/g1.go | 74 +
 ecc/bw6-761/g2.go | 74 +
 ecc/bw6-761/multiexp_affine.go | 857 ++++++++
 ecc/bw6-761/multiexp_test.go | 84 +-
 internal/generator/ecc/generate.go | 1 +
 .../ecc/template/multiexp_affine.go.tmpl | 474 +++++
 internal/generator/ecc/template/point.go.tmpl | 79 +
 .../ecc/template/tests/multiexp.go.tmpl | 43 +-
 40 files changed, 16535 insertions(+), 19 deletions(-)
 create mode 100644 ecc/bls12-377/multiexp_affine.go
 create mode 100644 ecc/bls12-378/multiexp_affine.go
 create mode 100644 ecc/bls12-381/multiexp_affine.go
 create mode 100644 ecc/bls24-315/multiexp_affine.go
 create mode 100644 ecc/bls24-317/multiexp_affine.go
 create mode 100644 ecc/bn254/multiexp_affine.go
 create mode 100644 ecc/bw6-633/multiexp_affine.go
 create mode 100644 ecc/bw6-756/multiexp_affine.go
 create mode 100644 ecc/bw6-761/multiexp_affine.go
 create mode 100644 internal/generator/ecc/template/multiexp_affine.go.tmpl

diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go
index 28be402bbf..bc9027480a 100644
--- a/ecc/bls12-377/g1.go
+++ b/ecc/bls12-377/g1.go
@@ -979,3 +979,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin
 	toReturnAff := BatchJacobianToAffineG1(toReturn)
 	return toReturnAff
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG1Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G1Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert for fp.Element; ignores edge cases
+func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go
index 8b8ecb3161..fdf535ca82 100644
--- a/ecc/bls12-377/g2.go
+++ b/ecc/bls12-377/g2.go
@@ -975,3 +975,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 	})
 	return toReturn
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fptower.E2
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fptower.E2
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fptower.E2
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert for fptower.E2; ignores edge cases
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) {
+
+	var accumulator fptower.E2
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go
new file mode 100644
index 0000000000..876ae12f01
--- /dev/null
+++ b/ecc/bls12-377/multiexp_affine.go
@@ -0,0 +1,1883 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
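The BatchInvertG1Affine / BatchInvertG2Affine helpers above are the Montgomery batch-inversion trick: n field inversions for the cost of one inversion plus 3(n-1) multiplications. A minimal standalone sketch of the same forward/backward pass, assuming gnark-crypto's bls12-377 fp package; the package layout and the final check are illustrative, not part of the patch:

package main

import (
	"fmt"

	"github.com/consensys/gnark-crypto/ecc/bls12-377/fp"
)

// batchInvert replaces every a[i] with 1/a[i] using a single field inversion:
// a forward pass of prefix products, one Inverse, then a backward unwind.
func batchInvert(a []fp.Element) {
	if len(a) == 0 {
		return
	}
	prefix := make([]fp.Element, len(a))
	var acc fp.Element
	acc.SetOne()
	for i := 0; i < len(a); i++ {
		prefix[i] = acc // prefix[i] = a[0]*...*a[i-1]
		acc.Mul(&acc, &a[i])
	}
	acc.Inverse(&acc) // the only inversion: 1/(a[0]*...*a[n-1])
	for i := len(a) - 1; i >= 0; i-- {
		prefix[i].Mul(&prefix[i], &acc) // = 1/a[i]
		acc.Mul(&acc, &a[i])            // peel a[i] off the running inverse
		a[i] = prefix[i]
	}
}

func main() {
	a := make([]fp.Element, 8)
	for i := range a {
		a[i].SetRandom()
	}
	a0 := a[0]
	batchInvert(a)
	var check fp.Element
	check.Mul(&a[0], &a0)
	fmt.Println("a[0] * 1/a[0] == 1:", check.IsOne()) // expect true
}

The helpers in the patch write the inverses into a separate fixed-size array rather than in place, but the pass structure is identical.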
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls12377
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
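+		// worked example (illustrative numbers, not from the patch): for
+		// nbPoints = 2^20 and 256-bit scalars, c = 16 costs
+		// 256/16 * (2^20 + 2^16) ≈ 17.8M group ops, c = 12 costs
+		// 256/12 * (2^20 + 2^12) ≈ 22.5M, and c = 21 costs
+		// 256/21 * (2^20 + 2^21) ≈ 38.3M; the larger window wins only
+		// until the 2^c bucket term dominates.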
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + 
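+		// (illustration, not in the original patch: this loop is a Horner
+		// evaluation in the group; the accumulator is doubled c times, i.e.
+		// multiplied by 2^c, before the next lower chunk is added, so chunk j
+		// ends up weighted by 2^{c*j}.)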
_p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
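+	// (illustration, not in the original patch: a bucket may appear at most
+	// once per batch; BatchAddG1Affine reads each R[j] and writes it back, so
+	// two ops on the same bucket within one batch would compute the second
+	// addition from stale coordinates. Ops that conflict with the current
+	// batch wait in this queue instead.)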
+ nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan 
g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], 
buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll 
loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 
0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { + var _p G2Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
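+		// (illustration, not in the original patch: the trade-off mirrors the
+		// G1 case; with 256-bit scalars, c = 16 yields 16 chunks while c = 21
+		// yields ceil(256/21) = 13 chunks, so fewer, larger windows pay more
+		// in the 2^c bucket-reduction term.)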
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
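+	// (note, illustrative: c = 16 divides fr.Limbs*64 = 256 exactly, so every window
+	// sits inside a single 64-bit limb and there is no smaller trailing window; this is
+	// why this variant uses nbChunks channels below rather than the nbChunks+1 of the other sizes)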
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 4e83dd849e..9e40c04401 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+					FromMont()
+			}
+
+			var result1, result2 G2Jac
+			for _, c := range cRange {
+				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
+				msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false)
+				msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false)
+				if !result1.Equal(&result2) {
+					return false
+				}
+			}
+			return true
+		},
+		genScalar,
+	))
+
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
@@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) {
 	for i := 5; i <= pow; i++ {
 		using := 1 << i
 
-		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+		b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) {
 			b.ResetTimer()
 			for j := 0; j < b.N; j++ {
 				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
 			}
 		})
+
+		b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
 	}
 }
diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go
index 17d313fedf..fd8fbe7ee0 100644
--- a/ecc/bls12-378/g1.go
+++ b/ecc/bls12-378/g1.go
@@ -979,3 +979,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin
 	toReturnAff := BatchJacobianToAffineG1(toReturn)
 	return toReturnAff
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG1Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G1Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert (fp.Element version); ignores edge cases (zero inputs)
+func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go
index ed5da711ae..479cda7053 100644
--- a/ecc/bls12-378/g2.go
+++ b/ecc/bls12-378/g2.go
@@ -975,3 +975,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 	})
 	return toReturn
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fptower.E2
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fptower.E2
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fptower.E2
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert (fptower.E2 version); ignores edge cases (zero inputs)
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) {
+
+	var accumulator fptower.E2
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go
new file mode 100644
index 0000000000..b92b826e91
--- /dev/null
+++ b/ecc/bls12-378/multiexp_affine.go
@@ -0,0 +1,1883 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls12378
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls12-378/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
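+//
+// A minimal usage sketch (assuming caller-provided points and scalars slices of equal
+// length; an empty ecc.MultiExpConfig is valid and defaults NbTasks to runtime.NumCPU()):
+//
+//	var res G1Jac
+//	if _, err := res.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{}); err != nil {
+//		// handle the error (length mismatch or invalid config)
+//	}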
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods lets us declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
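+		// worked example (illustrative, not a measurement): with fr.Limbs*64 = 256 and
+		// nbPoints = 2^20, c = 16 costs 256/16 * (2^20 + 2^16) ≈ 1.8e7 group ops while
+		// c = 12 costs 256/12 * (2^20 + 2^12) ≈ 2.2e7, so here the formula picks c = 16.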
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
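+	// (note, illustrative: c = 16 divides fr.Limbs*64 = 256 exactly, so every window
+	// sits inside a single 64-bit limb and there is no smaller trailing window; this is
+	// why this variant uses nbChunks channels below rather than the nbChunks+1 of the other sizes)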
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods lets us declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
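+	//     (concretely, assuming 4x64-bit limb base-field elements as on BN254: an affine
+	//     G1 point is two fp.Element values ≈ 64 bytes, one cache line, while an affine
+	//     G2 point is two fptower.E2 values ≈ 128 bytes, spanning two cache lines)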
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
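+	// (illustrative walk-through: with 256-bit scalars and config.NbTasks = 32, bestC may
+	// return c = 16, giving nbChunks = 16 < 32 on the first pass; the loop then doubles
+	// nbSplits to 2 and halves nbPoints, so two half-size MSMs of 16 chunks each run in
+	// parallel and their partial G2Jac results are accumulated below)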
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 3e2c1ae6cf..466e6499a1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index c7291c7130..189c5ac202 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -979,3 +979,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 168943cd91..3473f3d002 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -976,3 +976,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func 
BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fptower.E2 + + { + var lambdain [MAX_BATCH_SIZE]fptower.E2 + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG2Affine(&lambda, &lambdain, batchSize) + + } + + var d fptower.E2 + var rr G2Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfptower.E2, ignores edge cases +func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { + + var accumulator fptower.E2 + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go new file mode 100644 index 0000000000..7970a61d7e --- /dev/null +++ b/ecc/bls12-381/multiexp_affine.go @@ -0,0 +1,1883 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls12381 + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" + "math" + "runtime" +) + +const MAX_BATCH_SIZE = 600 + +type batchOp struct { + bucketID, pointID uint32 +} + +func (o batchOp) isNeg() bool { + return o.pointID&1 == 1 +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { + var _p G1Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
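+//
+// A minimal usage sketch (illustrative only, not part of the generated API surface;
+// it assumes the caller has already built points and scalars slices of equal length):
+//
+//	var r G1Jac
+//	if _, err := r.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{}); err != nil {
+//		// len(points) != len(scalars), or config.NbTasks > 1024
+//	}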
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
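+		// worked example (illustrative): with fr.Limbs = 4 the scalar has 256 bits; for
+		// nbPoints = 1<<20, cost(c=16) = 256/16 * (1<<20 + 1<<16) ≈ 17.8M group ops, vs
+		// cost(c=12) ≈ 22.5M and cost(c=20) ≈ 26.8M, so the loop above settles on c = 16.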
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
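+	// (c = 16 divides fr.Limbs*64 = 256 exactly, so there is no smaller trailing
+	// window here and no lastC special case, unlike the other batchAffineMsmCX sizes)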
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
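+	//	    (for scale: on this curve a G1 affine point is 2*48 = 96 bytes and a G2
+	//	    affine point 2*96 = 192 bytes, i.e. two and three cache lines respectively)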
+
+ // for each batchAffineMsmCX
+ // step 1
+ // we compute, for each scalar, its digits over c-bit wide windows (nbChunks digits per scalar)
+ // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+ // 2^{c} from the current digit, making it negative.
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G
+ // (computing -G is cheap, and this saves us half of the buckets)
+ // step 2
+ // buckets are declared on the stack
+ // notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+ // we use jacobian extended formulas here as they are faster than mixed addition
+ // msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
+ // step 3
+ // reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+ // ensure len(points) == len(scalars)
+ nbPoints := len(points)
+ if nbPoints != len(scalars) {
+ return nil, errors.New("len(points) != len(scalars)")
+ }
+
+ // if nbTasks is not set, use all available CPUs
+ if config.NbTasks <= 0 {
+ config.NbTasks = runtime.NumCPU()
+ } else if config.NbTasks > 1024 {
+ return nil, errors.New("invalid config: config.NbTasks > 1024")
+ }
+
+ // here, we compute the best C for nbPoints
+ // we split recursively until nbChunks(c) >= nbTasks
+ bestC := func(nbPoints int) uint64 {
+ // implemented batchAffineMsmC methods (the c we use must be in this slice)
+ implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+ var C uint64
+ // approximate cost (in group operations)
+ // cost = bits/c * (nbPoints + 2^{c})
+ // this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+ min := math.MaxFloat64
+ for _, c := range implementedCs {
+ cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+ cost := float64(cc) / float64(c)
+ if cost < min {
+ min = cost
+ C = c
+ }
+ }
+ // empirical, needs to be tuned.
+ // if C > 16 && nbPoints < 1 << 23 {
+ // C = 16
+ // }
+ return C
+ }
+
+ var C uint64
+ nbSplits := 1
+ nbChunks := 0
+ for nbChunks < config.NbTasks {
+ C = bestC(nbPoints)
+ nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+ if (fr.Limbs*64)%C != 0 {
+ nbChunks++
+ }
+ nbChunks *= nbSplits
+ if nbChunks < config.NbTasks {
+ nbSplits <<= 1
+ nbPoints >>= 1
+ }
+ }
+
+ // partition the scalars
+ // note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+ // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+ var smallValues int
+ scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+ // if we have more than 10% of small values, we split the processing of the first chunk in two
+ // we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+ splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+ // we have nbSplits intermediate results that we must sum together.
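[Editor's note] An aside on step 1 above, before the scatter/gather code that follows: the signed-digit decomposition can be sketched in isolation. This hypothetical helper is a simplified version of what partitionScalars does (a single 64-bit word and one recoding convention; the real code also handles multi-word scalars, Montgomery form, and counting small values):

// recodeWindows splits a 64-bit scalar into c-bit digits; whenever a digit is
// larger than 2^{c-1} it borrows 2^c from the next window, so the digit turns
// negative and is later handled by adding -G instead of G (halving the buckets).
func recodeWindows(s uint64, c uint) []int64 {
	var digits []int64
	carry := int64(0)
	for shift := uint(0); shift < 64; shift += c {
		d := carry + int64((s>>shift)&((1<<c)-1))
		carry = 0
		if d > 1<<(c-1) {
			d -= 1 << c // borrow 2^c from the next window...
			carry = 1   // ...and propagate the carry upward
		}
		digits = append(digits, d)
	}
	if carry != 0 {
		digits = append(digits, carry)
	}
	return digits
}

For example, with c = 4 the window value 13 becomes the digit -3 with a carry of 1 into the next window, since 13 = 16 - 3; this is why only 2^{c-1} buckets are needed, at the price of cheap point negations.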
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 0f9d048801..05f44f6112 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index bcf23fcd01..3209c210ad 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -981,3 +981,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index d7a009999d..7f377b8147 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -991,3 +991,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func 
BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fptower.E4 + + { + var lambdain [MAX_BATCH_SIZE]fptower.E4 + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG2Affine(&lambda, &lambdain, batchSize) + + } + + var d fptower.E4 + var rr G2Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfptower.E4, ignores edge cases +func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E4, n int) { + + var accumulator fptower.E4 + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go new file mode 100644 index 0000000000..e08952a333 --- /dev/null +++ b/ecc/bls24-315/multiexp_affine.go @@ -0,0 +1,1883 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls24315 + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" + "math" + "runtime" +) + +const MAX_BATCH_SIZE = 600 + +type batchOp struct { + bucketID, pointID uint32 +} + +func (o batchOp) isNeg() bool { + return o.pointID&1 == 1 +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { + var _p G1Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
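[Editor's note] Before the G1Jac implementation below: the BatchInvertG1Affine/BatchInvertG2Affine helpers added in g1.go and g2.go are Montgomery's batch-inversion trick (accumulate prefix products, invert once, unwind backward). A self-contained sketch over math/big (a hypothetical stand-alone program, not the patch's field types) shows that n inverses cost one inversion plus about 3n multiplications:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert returns the modular inverses of all a[i] modulo p using a single
// ModInverse call: a forward pass stores prefix products, then the inverted
// total product is unwound backward. Like the specialized helpers, it ignores
// the zero-input edge case (the MSM code screens degenerate additions, such as
// infinity and P = -Q, before queueing them).
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := 0; i < len(a); i++ {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1] mod p
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // acc = (a[0]*...*a[n-1])^-1 mod p
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // res[i] = a[i]^-1 mod p
		acc.Mul(acc, a[i]).Mod(acc, p)         // drop a[i] from the running inverse
	}
	return res
}

func main() {
	p := big.NewInt(97)
	inv := batchInvert([]*big.Int{big.NewInt(3), big.NewInt(10), big.NewInt(42)}, p)
	fmt.Println(inv) // each inv[i]*a[i] == 1 mod 97
}

Sharing one inversion across a whole batch is what makes the advertised cost of BatchAddG1Affine/BatchAddG2Affine (roughly 5 field multiplications per addition plus a single inversion per batch) possible, versus one full inversion per affine addition.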
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { + // note: + // each of the batchAffineMsmCX method is the same, except for the c constant it declares + // duplicating (through template generation) these methods allows to declare the buckets on the stack + // the choice of c needs to be improved: + // there is a theoritical value that gives optimal asymptotics + // but in practice, other factors come into play, including: + // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 + // * number of CPUs + // * cache friendliness (which depends on the host, G1 or G2... ) + // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. + + // for each batchAffineMsmCX + // step 1 + // we compute, for each scalars over c-bit wide windows, nbChunk digits + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. + // negative digits will be processed in the next step as adding -G into the bucket instead of G + // (computing -G is cheap, and this saves us half of the buckets) + // step 2 + // buckets are declared on the stack + // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) + // we use jacobian extended formulas here as they are faster than mixed addition + // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel + // step 3 + // reduce the buckets weigthed sums into our result (msmReduceChunk) + + // ensure len(points) == len(scalars) + nbPoints := len(points) + if nbPoints != len(scalars) { + return nil, errors.New("len(points) != len(scalars)") + } + + // if nbTasks is not set, use all available CPUs + if config.NbTasks <= 0 { + config.NbTasks = runtime.NumCPU() + } else if config.NbTasks > 1024 { + return nil, errors.New("invalid config: config.NbTasks > 1024") + } + + // here, we compute the best C for nbPoints + // we split recursively until nbChunks(c) >= nbTasks, + bestC := func(nbPoints int) uint64 { + // implemented batchAffineMsmC methods (the c we use must be in this slice) + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + var C uint64 + // approximate cost (in group operations) + // cost = bits/c * (nbPoints + 2^{c}) + // this needs to be verified empirically. + // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. 
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is identical, except for the c constant it declares
+	// duplicating (through template generation) these methods lets us declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
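// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of this patch: the cost
// trade-off described above can be made concrete. With fr.Limbs*64 = 256-bit
// scalars, a window width c gives ceil(256/c) chunks and 2^{c-1} buckets per
// chunk, and the heuristic implemented by bestC further below is roughly
// cost(c) = (256/c) * (nbPoints + 2^c) group operations. For nbPoints = 2^20,
// cost(12) ~= 21.3*(2^20 + 2^12) ~= 22.5M ops while cost(16) = 16*(2^20 + 2^16)
// ~= 17.8M ops, so the heuristic prefers c = 16 there. windowCost is a
// hypothetical, stdlib-only helper mirroring that formula:
func windowCost(nbPoints int, c uint64) float64 {
	const scalarBits = 256 // fr.Limbs * 64 for the curves in this PR
	return float64(scalarBits) / float64(c) * float64(nbPoints+(1<<c))
}
// ---------------------------------------------------------------------------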
+
+	// for each batchAffineMsmCX
+	// step 1
+	// for each scalar, we compute nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if NbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
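// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of this patch: partitionScalars
// implements the signed-digit recoding described in step 1 above. A stdlib-only
// illustration on a single 64-bit scalar (signedDigits is a hypothetical
// helper; the real code works limb by limb on fr.Element and also counts
// "small" scalars):
func signedDigits(scalar uint64, c uint) []int64 {
	var digits []int64
	carry := int64(0)
	mask := uint64(1)<<c - 1
	for shift := uint(0); shift < 64; shift += c {
		d := carry + int64((scalar>>shift)&mask)
		carry = 0
		if d >= 1<<(c-1) { // digit too large: borrow 2^c from the next window
			d -= 1 << c
			carry = 1
		}
		digits = append(digits, d) // d is in [-2^{c-1}, 2^{c-1}), so 2^{c-1} buckets suffice
	}
	if carry != 0 {
		digits = append(digits, carry)
	}
	return digits
}
// Sanity check: sum_i digits[i]*2^{c*i} reconstructs the scalar; with c = 4 and
// scalar = 0xDE, the digits start [-2, -2, 1, 0, ...] since 0xDE = -2 - 2*16 + 256.
// A digit d > 0 adds P to bucket d-1; a digit d < 0 adds -P to bucket -d-1.
// ---------------------------------------------------------------------------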
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 7cc9521976..6f5611f563 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index b740a62a2e..a7198ef2ea 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -981,3 +981,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 35d76023aa..907c1db13b 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -991,3 +991,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func 
BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fptower.E4
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fptower.E4
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fptower.E4
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to BatchInvert for fptower.E4 elements, but ignores edge cases
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E4, n int) {
+
+	var accumulator fptower.E4
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go
new file mode 100644
index 0000000000..6623e42510
--- /dev/null
+++ b/ecc/bls24-317/multiexp_affine.go
@@ -0,0 +1,1883 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls24317
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bls24-317/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
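// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of this patch: BatchInvertG2Affine
// above is the classic Montgomery batch-inversion trick, which replaces n field
// inversions with a single inversion plus roughly 3*(n-1) multiplications; that
// shared inversion is what makes the "cost add: 5*batchSize M + 1I" of
// BatchAddG2Affine attractive. A stdlib-only version over Z/pZ (batchInvert is
// a hypothetical helper; assumes `import "math/big"` and nonzero inputs):
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a { // forward pass: res[i] = a[0]*a[1]*...*a[i-1] mod p
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the single inversion for the whole batch
	for i := len(a) - 1; i >= 0; i-- { // backward pass peels off one factor at a time
		res[i].Mul(res[i], acc).Mod(res[i], p) // res[i] = 1/a[i] mod p
		acc.Mul(acc, a[i]).Mod(acc, p)         // acc = 1/(a[0]*...*a[i-1])
	}
	return res
}
// ---------------------------------------------------------------------------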
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is identical, except for the c constant it declares
+	// duplicating (through template generation) these methods lets us declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// for each scalar, we compute nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if NbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
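(An aside on how the per-chunk results recombine: the chunk running over window j carries weight 2^{c*j}, so msmReduceChunkG1AffineBatchAffine folds the chunk channels from the most significant window down, doubling c times between additions. Below is a scalar-arithmetic analogue of that Horner-style reduction; it models group elements as uint64 values and is illustrative only, not the actual group code.)

// reduceChunks mimics msmReduceChunk on plain integers: chunks are given
// least-significant first, and we double c times between successive chunks.
package main

import "fmt"

func reduceChunks(chunks []uint64, c int) uint64 {
	total := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			total += total // "double": multiply the accumulated weight by 2
		}
		total += chunks[j]
	}
	return total
}

func main() {
	// 0b110101 split into 2-bit chunks, least significant first: 01, 01, 11.
	fmt.Println(reduceChunks([]uint64{0b01, 0b01, 0b11}, 2)) // 53
}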
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// the batchAffineMsmCX methods are identical except for the c constant each declares;
+	// duplicating them (through template generation) allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics,
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
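(The cost model behind this choice is made concrete in the bestC closure below: cost(c) ≈ bits/c * (nbPoints + 2^c). A standalone sketch of that heuristic follows; bits is hardcoded to fr.Limbs*64 = 256 here as an assumption for illustration, and the real closure additionally leaves room for empirical tuning.)

package main

import (
	"fmt"
	"math"
)

// bestC picks the window width minimizing the approximate group-operation
// count: bits/c windows, each costing about nbPoints bucket additions plus
// 2^c additions for the bucket reduction.
func bestC(nbPoints int) uint64 {
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	const bits = 256 // stands in for fr.Limbs * 64
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(bits*(nbPoints+(1<<c))) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 22} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n))
	}
}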
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, its digits over c-bit wide windows (nbChunks of them)
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum on the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW),
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
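(Step 1 above, the signed-window recoding done by partitionScalars, can be illustrated on a toy scalar. This sketch uses one common convention, borrowing whenever a digit is at least 2^{c-1}; the library's low-level handling, with its msbWindow flag and in-place scalar rewrite, differs in detail.)

package main

import "fmt"

// signedDigits splits a scalar into c-bit digits from the least significant
// window up; whenever a digit reaches 2^{c-1}, subtract 2^c from it and carry
// 1 into the next window, so digits fit in half as many buckets (a negative
// digit adds -G instead of G).
func signedDigits(scalar uint64, c uint) []int64 {
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&((1<<c)-1)) + carry
		scalar >>= c
		carry = 0
		if d >= 1<<(c-1) {
			d -= 1 << c
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	c := uint(4)
	digits := signedDigits(0xDEAD, c)
	fmt.Println(digits) // [-3 -5 -1 -2 1]
	// sanity check: sum of digits[i] * 2^{c*i} reconstructs 0xDEAD
	var x int64
	for i := len(digits) - 1; i >= 0; i-- {
		x = x<<c + digits[i]
	}
	fmt.Println(x == 0xDEAD) // true
}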
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 3f435be696..b89f8a2375 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 05c3663fce..4056491a53 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -951,3 +951,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index db2118d180..deeb006578 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -980,3 +980,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG2Affine(R []*G2Affine, P 
[]G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fptower.E2
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fptower.E2
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fptower.E2
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the field-element BatchInvert; ignores edge cases (zero inputs)
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) {
+
+	var accumulator fptower.E2
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go
new file mode 100644
index 0000000000..d91b3cb89c
--- /dev/null
+++ b/ecc/bn254/multiexp_affine.go
@@ -0,0 +1,1883 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bn254
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
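(The batchOp type above packs the point index and its sign into pointID: index in the high 31 bits, sign in the lowest bit, which is why the chunk processors build ops with uint32(i) << 1 and dereference points with pointID >> 1. A minimal round-trip sketch follows; the newOp helper is hypothetical, introduced only for illustration.)

package main

import "fmt"

type batchOp struct {
	bucketID, pointID uint32
}

func (o batchOp) isNeg() bool { return o.pointID&1 == 1 }

// newOp tags a point index with its sign in the lowest bit.
func newOp(pointIndex uint32, neg bool) batchOp {
	op := batchOp{pointID: pointIndex << 1}
	if neg {
		op.pointID |= 1 // "add the negation of this point"
	}
	return op
}

func main() {
	op := newOp(42, true)
	fmt.Println(op.pointID>>1, op.isNeg()) // 42 true
}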
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, and on G1 vs G2)
+	// --> for example, on BN254, a G1 point fits into one 64-byte cache line, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
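+		// (worked example, illustrative only: with fr.Limbs = 4, i.e. 256-bit scalars,
+		// and nbPoints = 2^20, c = 16 costs 256/16 * (2^20 + 2^16) ~= 17.8M group ops,
+		// while c = 12 costs 256/12 * (2^20 + 2^12) ~= 22.5M, so the loop above picks C = 16)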
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := 
len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical 
for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, 
points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, 
scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window 
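+	// (here c = 16 divides fr.Limbs * 64 = 256 exactly, giving 16 chunks and 16 such
+	// goroutines, with no smaller trailing window to special-case)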
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G1Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g1JacExtended + msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine 
that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each go routine sends its result in chChunks[i] channel
+	var chChunks [nbChunks + 1]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	// c doesn't divide 256; the last window is smaller, so we can allocate fewer buckets
+	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+	// TODO @gbotrel replace this in code generator
+	if lastC >= 10 {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]G1Affine
+			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	} else {
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			var buckets [1 << (lastC - 1)]g1JacExtended
+			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
+		}(uint64(nbChunks), points, scalars)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, and on G1 vs G2)
+	// --> for example, on BN254, a G1 point fits into one 64-byte cache line, but a G2 point doesn't.
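+	// (concretely, an affine BN254 G1 point is two fp.Element = 64 bytes, while an affine
+	// G2 point is two fptower.E2 = 128 bytes, i.e. two cache lines)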
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
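+	// (the first nbSplits-1 slices are summed on their own goroutines below; the last
+	// slice is processed on the current goroutine and partial results are added into p
+	// as they complete)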
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 6: + p.msmC6(points, scalars, splitFirstChunk) + + case 7: + p.msmC7(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 9: + p.msmC9(points, scalars, splitFirstChunk) + + case 10: + p.batchAffineMsmC10(points, scalars, splitFirstChunk) + + case 11: + p.batchAffineMsmC11(points, scalars, splitFirstChunk) + + case 12: + p.batchAffineMsmC12(points, scalars, splitFirstChunk) + + case 13: + p.batchAffineMsmC13(points, scalars, splitFirstChunk) + + case 14: + p.batchAffineMsmC14(points, scalars, splitFirstChunk) + + case 15: + p.batchAffineMsmC15(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + case 20: + p.batchAffineMsmC20(points, scalars, splitFirstChunk) + + case 21: + p.batchAffineMsmC21(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { 
+ B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 10 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 11 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else 
{ + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 12 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 13 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 14 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + 
chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 15 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if 
!splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 20 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} + +func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 21 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks + 1]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // c doesn't divide 256, last window is smaller we can allocate less buckets + const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // TODO @gbotrel replace this in code generator + if lastC >= 10 { + go func(j uint64, points []G2Affine, scalars 
[]fr.Element) { + var buckets [1 << (lastC - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } else { + go func(j uint64, points []G2Affine, scalars []fr.Element) { + var buckets [1 << (lastC - 1)]g2JacExtended + msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) + }(uint64(nbChunks), points, scalars) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 16db1a5bbf..61341020f0 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+				FromMont()
+			}
+
+			var result1, result2 G2Jac
+			for _, c := range cRange {
+				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
+				msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false)
+				msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false)
+				if !result1.Equal(&result2) {
+					return false
+				}
+			}
+			return true
+		},
+		genScalar,
+	))
+
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
@@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) {
 	for i := 5; i <= pow; i++ {
 		using := 1 << i
 
-		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+		b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) {
 			b.ResetTimer()
 			for j := 0; j < b.N; j++ {
 				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
 			}
 		})
+
+		b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
 	}
 }
diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go
index d8a25ab2a8..f70d2b30cc 100644
--- a/ecc/bw6-633/g1.go
+++ b/ecc/bw6-633/g1.go
@@ -1083,3 +1083,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin
 	toReturnAff := BatchJacobianToAffineG1(toReturn)
 	return toReturnAff
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG1Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G1Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the fp.Element batch inversion; ignores edge cases (zero inputs)
+func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go
index 8019989178..f9284d2ec7 100644
--- a/ecc/bw6-633/g2.go
+++ b/ecc/bw6-633/g2.go
@@ -946,3 +946,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 	})
 	return toReturn
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG2Affine(R
[]*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the fp.Element batch inversion; ignores edge cases (zero inputs)
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go
new file mode 100644
index 0000000000..79740b7a69
--- /dev/null
+++ b/ecc/bw6-633/multiexp_affine.go
@@ -0,0 +1,857 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bw6633
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-633/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
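+//
+// A minimal usage sketch (hypothetical points/scalars slices of equal length):
+//
+//	var res G1Jac
+//	if _, err := res.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{}); err != nil {
+//		// handle the error (mismatched lengths, invalid config)
+//	}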
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
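+		// worked example of the cost model above (assuming fr.Limbs*64 = 320 bits; hypothetical sizes):
+		// nbPoints = 2^20 with c = 16 costs (320/16)*(2^20+2^16) ≈ 2.2e7 group operations,
+		// while c = 8 costs (320/8)*(2^20+2^8) ≈ 4.2e7, so c = 16 is selected here.
+		// the commented-out cap below is one such hand tuning: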
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID 
uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].IsInfinity() {
+			runningSum.addMixed(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	const (
+		c        = 16                  // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each goroutine sends its result on the chChunks[i] channel
+	var chChunks [nbChunks]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
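+	// worked example (hypothetical, c = 4): a window digit of 13 > 2^3 becomes
+	// 13 - 16 = -3, and the next window's digit is incremented by 1.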
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
+
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
+
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from the LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur the cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+
+	// we have nbSplits intermediate results that we must sum together.
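+	// worked example (hypothetical sizes): with nbSplits = 2 and 2^20 points, the goroutine
+	// spawned below handles points[0:2^19], the current goroutine handles points[2^19:],
+	// and the partial G2Jac results are folded into p with AddAssign.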
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = 
queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 401434b1bf..282b60e573 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+				FromMont()
+			}
+
+			var result1, result2 G2Jac
+			for _, c := range cRange {
+				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
+				msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false)
+				msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false)
+				if !result1.Equal(&result2) {
+					return false
+				}
+			}
+			return true
+		},
+		genScalar,
+	))
+
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
@@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) {
 	for i := 5; i <= pow; i++ {
 		using := 1 << i
 
-		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
+		b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) {
 			b.ResetTimer()
 			for j := 0; j < b.N; j++ {
 				testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
 			}
 		})
+
+		b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) {
+			b.ResetTimer()
+			for j := 0; j < b.N; j++ {
+				testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{})
+			}
+		})
 	}
 }
diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go
index 6cc8b67f14..038e4f1b42 100644
--- a/ecc/bw6-756/g1.go
+++ b/ecc/bw6-756/g1.go
@@ -1083,3 +1083,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin
 	toReturnAff := BatchJacobianToAffineG1(toReturn)
 	return toReturnAff
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG1Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G1Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the fp.Element batch inversion; ignores edge cases (zero inputs)
+func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go
index 171069cc4d..cb9fadd15d 100644
--- a/ecc/bw6-756/g2.go
+++ b/ecc/bw6-756/g2.go
@@ -940,3 +940,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 	})
 	return toReturn
 }
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAddG2Affine(R
[]*G2Affine, P []G2Affine, batchSize int) {
+	if batchSize == 0 {
+		return
+	}
+	var isDbl [MAX_BATCH_SIZE]bool
+	var lambda [MAX_BATCH_SIZE]fp.Element
+
+	{
+		var lambdain [MAX_BATCH_SIZE]fp.Element
+
+		for j := 0; j < batchSize; j++ {
+			// detect dbl vs add & compute denominator
+			if P[j].Equal(R[j]) {
+				isDbl[j] = true
+				lambdain[j].Double(&P[j].Y)
+			} else {
+				lambdain[j].Sub(&P[j].X, &R[j].X)
+			}
+		}
+
+		// invert denominator
+		BatchInvertG2Affine(&lambda, &lambdain, batchSize)
+
+	}
+
+	var d fp.Element
+	var rr G2Affine
+
+	for j := 0; j < batchSize; j++ {
+		// compute lambda, distinguishing dbl / add
+		if isDbl[j] {
+			d.Square(&P[j].X)
+			lambda[j].Mul(&lambda[j], &d)
+			d.Double(&lambda[j])
+			lambda[j].Add(&lambda[j], &d)
+		} else {
+			d.Sub(&P[j].Y, &R[j].Y)
+			lambda[j].Mul(&lambda[j], &d)
+		}
+
+		// compute X, Y
+		rr.X.Square(&lambda[j])
+		rr.X.Sub(&rr.X, &R[j].X)
+		rr.X.Sub(&rr.X, &P[j].X)
+		d.Sub(&R[j].X, &rr.X)
+		rr.Y.Mul(&lambda[j], &d)
+		rr.Y.Sub(&rr.Y, &R[j].Y)
+		R[j].Set(&rr)
+	}
+}
+
+// batch inversion
+// similar to the fp.Element batch inversion; ignores edge cases (zero inputs)
+func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) {
+
+	var accumulator fp.Element
+	accumulator.SetOne()
+
+	for i := 0; i < n; i++ {
+		res[i] = accumulator
+		accumulator.Mul(&accumulator, &a[i])
+	}
+
+	accumulator.Inverse(&accumulator)
+
+	for i := n - 1; i >= 0; i-- {
+		res[i].Mul(&res[i], &accumulator)
+		accumulator.Mul(&accumulator, &a[i])
+	}
+}
diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go
new file mode 100644
index 0000000000..427f4d3891
--- /dev/null
+++ b/ecc/bw6-756/multiexp_affine.go
@@ -0,0 +1,857 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bw6756
+
+import (
+	"errors"
+	"github.com/consensys/gnark-crypto/ecc"
+	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
+	"math"
+	"runtime"
+)
+
+const MAX_BATCH_SIZE = 600
+
+type batchOp struct {
+	bucketID, pointID uint32
+}
+
+func (o batchOp) isNeg() bool {
+	return o.pointID&1 == 1
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
+	var _p G1Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
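+//
+// A usage sketch with an explicit parallelism target (hypothetical inputs; NbTasks
+// defaults to runtime.NumCPU() when left at zero):
+//
+//	var res G1Jac
+//	if _, err := res.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{NbTasks: 4}); err != nil {
+//		// handle the error
+//	}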
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for a G2 MultiExp with > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
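+		// worked example of the crossover (assuming fr.Limbs*64 = 384 bits; hypothetical sizes):
+		// at nbPoints = 2^14, c = 8 costs (384/8)*(2^14+2^8) ≈ 8.0e5 and beats c = 16 (≈ 2.0e6);
+		// at nbPoints = 2^20 the bucket term 2^c is amortized and c = 16 (≈ 2.7e7) beats c = 8 (≈ 5.0e7).
+		// the commented-out cap below is one such hand tuning: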
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID 
uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].IsInfinity() {
+			runningSum.addMixed(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	const (
+		c        = 16                  // scalars partitioned into c-bit radixes
+		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	)
+
+	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
+
+	// each goroutine sends its result on the chChunks[i] channel
+	var chChunks [nbChunks]chan g1JacExtended
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g1JacExtended, 1)
+	}
+
+	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
+		var buckets [1 << (c - 1)]G1Affine
+		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	}
+
+	for j := int(nbChunks - 1); j > 0; j-- {
+		go processChunk(j, points, scalars, chChunks[j])
+	}
+
+	if !splitFirstChunk {
+		go processChunk(0, points, scalars, chChunks[0])
+	} else {
+		chSplit := make(chan g1JacExtended, 2)
+		split := len(points) / 2
+		go processChunk(0, points[:split], scalars[:split], chSplit)
+		go processChunk(0, points[split:], scalars[split:], chSplit)
+		go func() {
+			s1 := <-chSplit
+			s2 := <-chSplit
+			close(chSplit)
+			s1.add(&s2)
+			chChunks[0] <- s1
+		}()
+	}
+
+	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//   --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G + // (computing -G is cheap, and this saves us half of the buckets) + // step 2 + // buckets are declared on the stack + // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) + // we use jacobian extended formulas here as they are faster than mixed addition + // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel + // step 3 + // reduce the buckets weigthed sums into our result (msmReduceChunk) + + // ensure len(points) == len(scalars) + nbPoints := len(points) + if nbPoints != len(scalars) { + return nil, errors.New("len(points) != len(scalars)") + } + + // if nbTasks is not set, use all available CPUs + if config.NbTasks <= 0 { + config.NbTasks = runtime.NumCPU() + } else if config.NbTasks > 1024 { + return nil, errors.New("invalid config: config.NbTasks > 1024") + } + + // here, we compute the best C for nbPoints + // we split recursively until nbChunks(c) >= nbTasks, + bestC := func(nbPoints int) uint64 { + // implemented batchAffineMsmC methods (the c we use must be in this slice) + implementedCs := []uint64{4, 5, 8, 16} + var C uint64 + // approximate cost (in group operations) + // cost = bits/c * (nbPoints + 2^{c}) + // this needs to be verified empirically. + // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. 
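+	// (worked example of the loop above, assuming 384-bit scalars (fr.Limbs = 6) and
+	// config.NbTasks = 32: a first pass picking C = 16 gives nbChunks = 384/16 = 24 < 32,
+	// so nbSplits doubles to 2 and nbPoints halves; if bestC picks 16 again, nbChunks
+	// becomes 2*24 = 48 >= 32 and the loop exits with 2 splits over half the points each)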
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = 
queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 6fca10f2eb..d101a5c9a6 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index e940a54575..765d29433b 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1094,3 +1094,77 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin toReturnAff := BatchJacobianToAffineG1(toReturn) return toReturnAff } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG1Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G1Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index cef585280a..fdb98731d4 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -954,3 +954,77 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin }) return toReturn } + +// batch add/dbl in affine coordinates +// using batch inversion +// cost add: 5*batchSize M + 1I, dbl: +1M +func BatchAddG2Affine(R 
[]*G2Affine, P []G2Affine, batchSize int) { + if batchSize == 0 { + return + } + var isDbl [MAX_BATCH_SIZE]bool + var lambda [MAX_BATCH_SIZE]fp.Element + + { + var lambdain [MAX_BATCH_SIZE]fp.Element + + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + } + + // invert denominator + BatchInvertG2Affine(&lambda, &lambdain, batchSize) + + } + + var d fp.Element + var rr G2Affine + + for j := 0; j < batchSize; j++ { + // computa lambda, distinguishing dbl / add + if isDbl[j] { + d.Square(&P[j].X) + lambda[j].Mul(&lambda[j], &d) + d.Double(&lambda[j]) + lambda[j].Add(&lambda[j], &d) + } else { + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) + } + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[j].X) + rr.X.Sub(&rr.X, &P[j].X) + d.Sub(&R[j].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[j].Y) + R[j].Set(&rr) + } +} + +// batch inversion +// similar to BatchInvertfp.Element, ignores edge cases +func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { + + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < n; i++ { + res[i] = accumulator + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := n - 1; i >= 0; i-- { + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } +} diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go new file mode 100644 index 0000000000..09004ae309 --- /dev/null +++ b/ecc/bw6-761/multiexp_affine.go @@ -0,0 +1,857 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bw6761 + +import ( + "errors" + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "math" + "runtime" +) + +const MAX_BATCH_SIZE = 600 + +type batchOp struct { + bucketID, pointID uint32 +} + +func (o batchOp) isNeg() bool { + return o.pointID&1 == 1 +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { + var _p G1Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
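+//
+// A minimal usage sketch (hypothetical caller; points and scalars are assumed to be
+// pre-filled with len(points) == len(scalars)):
+//
+//	var acc G1Jac
+//	if _, err := acc.MultiExpBatchAffine(points, scalars, ecc.MultiExpConfig{}); err != nil {
+//		// handle invalid input / config
+//	}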
+func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{4, 5, 8, 16}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]G1Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG1Affine struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G1Affine +} + +func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG1Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG1Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG1Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG1Affine) CanAdd(bID 
uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG1Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG1AffineBatchAffine(chunk uint64, + chRes chan<- g1JacExtended, + buckets []G1Affine, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG1Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g1JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { + var buckets [1 << (c - 1)]G1Affine + msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { + var _p G2Jac + if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { + // note: + // each of the batchAffineMsmCX method is the same, except for the c constant it declares + // duplicating (through template generation) these methods allows to declare the buckets on the stack + // the choice of c needs to be improved: + // there is a theoritical value that gives optimal asymptotics + // but in practice, other factors come into play, including: + // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 + // * number of CPUs + // * cache friendliness (which depends on the host, G1 or G2... ) + // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. + + // for each batchAffineMsmCX + // step 1 + // we compute, for each scalars over c-bit wide windows, nbChunk digits + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
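+	// (encoding note, restating what the code below does: a batchOp packs the point index
+	// and the sign into a single uint32, pointID = i<<1 for +points[i] and pointID = i<<1 | 1
+	// for -points[i], so isNeg() only has to test the low bit)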
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G + // (computing -G is cheap, and this saves us half of the buckets) + // step 2 + // buckets are declared on the stack + // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) + // we use jacobian extended formulas here as they are faster than mixed addition + // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel + // step 3 + // reduce the buckets weigthed sums into our result (msmReduceChunk) + + // ensure len(points) == len(scalars) + nbPoints := len(points) + if nbPoints != len(scalars) { + return nil, errors.New("len(points) != len(scalars)") + } + + // if nbTasks is not set, use all available CPUs + if config.NbTasks <= 0 { + config.NbTasks = runtime.NumCPU() + } else if config.NbTasks > 1024 { + return nil, errors.New("invalid config: config.NbTasks > 1024") + } + + // here, we compute the best C for nbPoints + // we split recursively until nbChunks(c) >= nbTasks, + bestC := func(nbPoints int) uint64 { + // implemented batchAffineMsmC methods (the c we use must be in this slice) + implementedCs := []uint64{4, 5, 8, 16} + var C uint64 + // approximate cost (in group operations) + // cost = bits/c * (nbPoints + 2^{c}) + // this needs to be verified empirically. + // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. 
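+	// (sketch of the fan-out/fan-in below: nbSplits-1 goroutines each compute a partial MSM
+	// over their slice into _p[i] and signal completion on chDone; the last slice is computed
+	// on the current goroutine directly into p, and partial results are folded in with
+	// AddAssign as they arrive)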
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + + case 4: + p.msmC4(points, scalars, splitFirstChunk) + + case 5: + p.msmC5(points, scalars, splitFirstChunk) + + case 8: + p.msmC8(points, scalars, splitFirstChunk) + + case 16: + p.batchAffineMsmC16(points, scalars, splitFirstChunk) + + default: + panic("not implemented") + } +} + +// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + +type BatchG2Affine struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []G2Affine +} + +func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *BatchG2Affine) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = 
queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunkG2AffineBatchAffine(chunk uint64, + chRes chan<- g2JacExtended, + buckets []G2Affine, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatchG2Affine(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. + } + + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + const ( + c = 16 // scalars partitioned into c-bit radixes + nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + ) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance + + // each go routine sends its result in chChunks[i] channel + var chChunks [nbChunks]chan g2JacExtended + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { + var buckets [1 << (c - 1)]G2Affine + msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + } + + for j := int(nbChunks - 1); j > 0; j-- { + go processChunk(j, points, scalars, chChunks[j]) + } + + if !splitFirstChunk { + go processChunk(0, points, scalars, chChunks[0]) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, points[:split], scalars[:split], chSplit) + go processChunk(0, points[split:], scalars[split:], chSplit) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + + return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) +} diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 07de759142..d5b1288c1e 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -187,6 +187,39 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G1Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -245,12 +278,19 @@ func BenchmarkMultiExpG1(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -478,6 +518,39 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) + properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + for i := 1; i <= nbSamples; i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } + + var result1, result2 G2Jac + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) + msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -536,12 +609,19 @@ func BenchmarkMultiExpG2(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 594e1343cd..bc367c5c14 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -15,6 +15,7 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er entries := []bavard.Entry{ {File: filepath.Join(baseDir, "multiexp.go"), Templates: []string{"multiexp.go.tmpl"}}, + {File: filepath.Join(baseDir, "multiexp_affine.go"), Templates: []string{"multiexp_affine.go.tmpl"}}, {File: filepath.Join(baseDir, "multiexp_test.go"), Templates: []string{"tests/multiexp.go.tmpl"}}, {File: filepath.Join(baseDir, "marshal.go"), Templates: []string{"marshal.go.tmpl"}}, {File: filepath.Join(baseDir, "marshal_test.go"), Templates: []string{"tests/marshal.go.tmpl"}}, diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl new file mode 100644 index 0000000000..02d8c72588 --- /dev/null +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -0,0 +1,474 @@ +{{ $G1TAffine := print (toUpper .G1.PointName) "Affine" }} +{{ $G1TJacobian := print (toUpper .G1.PointName) "Jac" }} +{{ $G1TJacobianExtended := print (toLower .G1.PointName) "JacExtended" }} + +{{ $G2TAffine := print (toUpper .G2.PointName) "Affine" }} +{{ $G2TJacobian := print (toUpper .G2.PointName) "Jac" }} +{{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} + +import ( + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" + "github.com/consensys/gnark-crypto/ecc" + "errors" + "math" + "runtime" +) + +const MAX_BATCH_SIZE = 600 + +type batchOp struct { + bucketID, pointID uint32 +} + +func (o batchOp) isNeg() bool { + return o.pointID&1 == 1 +} + + + +{{ template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} + + +{{define "multiexp" }} + + +// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != 
len(points) or if provided config is invalid.
+func (p *{{ $.TAffine }}) MultiExpBatchAffine(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TAffine }}, error) {
+	var _p {{$.TJacobian}}
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *{{ $.TJacobian }}) MultiExpBatchAffine(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TJacobian }}, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{
+			{{- range $c := $.CRange}} {{- if and (eq $.PointName "g1") (gt $c 21)}}{{- else}} {{$c}},{{- end}}{{- end}}
+		}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C + } + + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs * 64) % C != 0 { + nbChunks ++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } + + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInner{{ $.TJacobian }}BatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + // we have nbSplits intermediate results that we must sum together. + _p := make([]{{ $.TJacobian }}, nbSplits - 1) + chDone := make(chan int, nbSplits - 1) + for i:=0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInner{{ $.TJacobian }}BatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } + + msmInner{{ $.TJacobian }}BatchAffine(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + for i:=0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) + } + close(chDone) + return p, nil +} + +func msmInner{{ $.TJacobian }}BatchAffine(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { + + switch c { + {{range $c := $.CRange}} + case {{$c}}: + {{- if le $c 9}} + p.msmC{{$c}}(points, scalars, splitFirstChunk) + {{- else}} + p.batchAffineMsmC{{$c}}(points, scalars, splitFirstChunk) + {{- end}} + {{end}} + default: + panic("not implemented") + } +} + +// msmReduceChunk{{ $.TAffine }}BatchAffine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunk{{ $.TAffine }}BatchAffine(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { + var _p {{ $.TJacobianExtended }} + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + + +type Batch{{ $.TAffine }} struct { + P [MAX_BATCH_SIZE]{{ $.TAffine }} + R [MAX_BATCH_SIZE]*{{ $.TAffine }} + batchSize int + cptP int + bucketIds map[uint32]struct{} + buckets, points []{{ $.TAffine }} +} + +func newBatch{{ $.TAffine }}(buckets, points []{{ $.TAffine }}) Batch{{ $.TAffine }} { + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return Batch{{ $.TAffine }}{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(buckets)/2), + } +} + +func (b *Batch{{ $.TAffine }}) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *Batch{{ $.TAffine }}) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAdd{{ $.TAffine 
}}(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *Batch{{ $.TAffine }}) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *Batch{{ $.TAffine }}) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + B := &b.buckets[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if B.IsInfinity() { + if op.isNeg() { + B.Neg(P) + } else { + B.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if B.Equal(P) { + B.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { + B.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = B + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueue{{ $.TAffine }}(queue []batchOp, batch *Batch{{ $.TAffine }}) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + +} + +func msmProcessChunk{{ $.TAffine }}BatchAffine(chunk uint64, + chRes chan<- {{ $.TJacobianExtended }}, + buckets []{{ $.TAffine }}, + c uint64, + points []{{ $.TAffine }}, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + batch := newBatch{{ $.TAffine }}(buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueue{{ $.TAffine }}(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. 
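+		// (termination note: ExecuteAndReset leaves the batch with no pending buckets, so the
+		// next processQueue pass can always consume at least one queued op; the queue therefore
+		// keeps shrinking and this loop terminates)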
+ }
+
+ // flush items in batch.
+ batch.ExecuteAndReset()
+
+ // reduce buckets into total
+ // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+ var runningSum, total {{ $.TJacobianExtended }}
+ runningSum.setInfinity()
+ total.setInfinity()
+ for k := len(buckets) - 1; k >= 0; k-- {
+ if !buckets[k].IsInfinity() {
+ runningSum.addMixed(&buckets[k])
+ }
+ total.add(&runningSum)
+ }
+
+ chRes <- total
+
+}
+
+
+{{range $c := $.CRange}}
+{{- if gt $c 9}}
+{{- $frBits := mul $.FrNbWords 64}}
+{{- $cDividesBits := divides $c $frBits}}
+{{- $nbChunks := div $frBits $c}}
+
+func (p *{{ $.TJacobian }}) batchAffineMsmC{{$c}}(points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} {
+ const (
+ c = {{$c}} // scalars partitioned into c-bit radixes
+ nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+ )
+
+ // for each chunk, spawn one go routine that'll loop through all the scalars in the
+ // corresponding bit-window
+ // note that buckets is an array allocated on the stack (for most sizes of c) and this is
+ // critical for performance
+
+ // each go routine sends its result in chChunks[i] channel
+ var chChunks [nbChunks{{if not $cDividesBits }} + 1 {{end}} ]chan {{ $.TJacobianExtended }}
+ for i := 0; i < len(chChunks); i++ {
+ chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1)
+ }
+
+
+ {{ if not $cDividesBits }}
+
+ // c doesn't divide {{$frBits}}, the last window is smaller, so we can allocate fewer buckets
+ const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+ // TODO @gbotrel replace this in code generator
+ if lastC >= 10 {
+ go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) {
+ var buckets [1<<(lastC-1)]{{ $.TAffine }}
+ msmProcessChunk{{ $.TAffine }}BatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
+ }(uint64(nbChunks), points, scalars)
+ } else {
+ go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) {
+ var buckets [1<<(lastC-1)]{{ $.TJacobianExtended }}
+ msmProcessChunk{{ $.TAffine }}(j, chChunks[j], buckets[:], c, points, scalars)
+ }(uint64(nbChunks), points, scalars)
+ }
+ {{- end}}
+
+ processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) {
+ var buckets [1<<(c-1)]{{ $.TAffine }}
+ msmProcessChunk{{ $.TAffine }}BatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+ }
+
+ for j := int(nbChunks - 1); j > 0; j-- {
+ go processChunk(j, points, scalars, chChunks[j])
+ }
+
+ if !splitFirstChunk {
+ go processChunk(0, points, scalars, chChunks[0])
+ } else {
+ chSplit := make(chan {{ $.TJacobianExtended }}, 2)
+ split := len(points) / 2
+ go processChunk(0, points[:split], scalars[:split], chSplit)
+ go processChunk(0, points[split:], scalars[split:], chSplit)
+ go func() {
+ s1 := <-chSplit
+ s2 := <-chSplit
+ close(chSplit)
+ s1.add(&s2)
+ chChunks[0] <- s1
+ }()
+ }
+
+ return msmReduceChunk{{ $.TAffine }}BatchAffine(p, c, chChunks[:])
+}
+{{- end}}
+{{end}}
+
+
+{{end }}
diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl
index d7e8ea021d..bbc5ec8980 100644
--- a/internal/generator/ecc/template/point.go.tmpl
+++ b/internal/generator/ecc/template/point.go.tmpl
@@ -1568,3 +1568,82 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca
 return toReturn
 {{- end}}
 }
+
+
+
+// batch add/dbl in affine coordinates
+// using batch inversion
+// cost add: 5*batchSize M + 1I, dbl: +1M
+func BatchAdd{{ $TAffine }}(R []*{{ $TAffine 
}}, P []{{ $TAffine }}, batchSize int) {
+ if batchSize == 0 {
+ return
+ }
+ var isDbl [MAX_BATCH_SIZE]bool
+ var lambda [MAX_BATCH_SIZE]{{.CoordType}}
+
+ {
+ var lambdain [MAX_BATCH_SIZE]{{.CoordType}}
+
+
+ for j := 0; j < batchSize; j++ {
+ // detect dbl vs add & compute denominator
+ if P[j].Equal(R[j]) {
+ isDbl[j] = true
+ lambdain[j].Double(&P[j].Y)
+ } else {
+ lambdain[j].Sub(&P[j].X, &R[j].X)
+ }
+ }
+
+ // invert denominator
+ BatchInvert{{ $TAffine }}(&lambda, &lambdain, batchSize)
+
+ }
+
+ var d {{.CoordType}}
+ var rr {{ $TAffine }}
+
+ for j := 0; j < batchSize; j++ {
+ // compute lambda, distinguishing dbl / add
+ if isDbl[j] {
+ d.Square(&P[j].X)
+ lambda[j].Mul(&lambda[j], &d)
+ d.Double(&lambda[j])
+ lambda[j].Add(&lambda[j], &d)
+ } else {
+ d.Sub(&P[j].Y, &R[j].Y)
+ lambda[j].Mul(&lambda[j], &d)
+ }
+
+ // compute X, Y
+ rr.X.Square(&lambda[j])
+ rr.X.Sub(&rr.X, &R[j].X)
+ rr.X.Sub(&rr.X, &P[j].X)
+ d.Sub(&R[j].X, &rr.X)
+ rr.Y.Mul(&lambda[j], &d)
+ rr.Y.Sub(&rr.Y, &R[j].Y)
+ R[j].Set(&rr)
+ }
+}
+
+
+
+// batch inversion
+// similar to BatchInvert{{.CoordType}}, ignores edge cases
+func BatchInvert{{ $TAffine }}(res, a *[MAX_BATCH_SIZE]{{.CoordType}}, n int) {
+
+ var accumulator {{.CoordType}}
+ accumulator.SetOne()
+
+ for i := 0; i < n; i++ {
+ res[i] = accumulator
+ accumulator.Mul(&accumulator, &a[i])
+ }
+
+ accumulator.Inverse(&accumulator)
+
+ for i := n - 1; i >= 0; i-- {
+ res[i].Mul(&res[i], &accumulator)
+ accumulator.Mul(&accumulator, &a[i])
+ }
+}
\ No newline at end of file
diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl
index fdf2a82d8e..cde8bd0b2a 100644
--- a/internal/generator/ecc/template/tests/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl
@@ -196,6 +196,40 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 ))
 
 
+ properties.Property(fmt.Sprintf("[{{ toUpper $.PointName }}] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
+ func(mixer fr.Element) bool {
+ // multi exp points
+ var samplePoints [nbSamples]{{ $.TAffine }}
+ var g {{ $.TJacobian }}
+ g.Set(&{{ toLower .PointName}}Gen)
+ for i := 1; i <= nbSamples; i++ {
+ samplePoints[i-1].FromJacobian(&g)
+ g.AddAssign(&{{ toLower .PointName}}Gen)
+ }
+ // mixer ensures that all the words of a fpElement are set
+ var sampleScalars [nbSamples]fr.Element
+
+ for i := 1; i <= nbSamples; i++ {
+ sampleScalars[i-1].SetUint64(uint64(i)).
+ Mul(&sampleScalars[i-1], &mixer).
+ FromMont() + } + + var result1, result2 {{ $.TJacobian }} + for _, c := range cRange { + scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) + msmInner{{ $.TJacobian }}(&result1, int(c), samplePoints[:], scalars, false) + msmInner{{ $.TJacobian }}BatchAffine(&result2, int(c), samplePoints[:], scalars, false) + if !result1.Equal(&result2) { + return false + } + } + return true + }, + genScalar, + )) + + // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[{{ toUpper $.PointName }}] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -256,12 +290,19 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) + } + }) } } From 853bfb1f8edaa3cd4b54a3975b390694b8f98858 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 7 Nov 2022 16:13:00 -0600 Subject: [PATCH 02/43] build: updated go.mod to go 1.18 req --- go.mod | 2 +- go.sum | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 583ea31428..f1fd1fb56c 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/consensys/gnark-crypto -go 1.17 +go 1.18 require ( github.com/consensys/bavard v0.1.13 diff --git a/go.sum b/go.sum index 24019a2212..a0175604ce 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -github.com/consensys/bavard v0.1.12 h1:rApQlUvBg5FeW/fnigtVnAs0sBrgDN2pEuHNdWElSUE= -github.com/consensys/bavard v0.1.12/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ= github.com/consensys/bavard v0.1.13/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= @@ -28,15 +26,8 @@ github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PK github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa h1:zuSxTR4o9y82ebqCUJYNGJbGPo6sKVl54f/TVDObg1c= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220727055044-e65921a090b8 h1:dyU22nBWzrmTQxtNrr4dzVOvaw35nUYE279vF9UmsI8= golang.org/x/sys v0.0.0-20220727055044-e65921a090b8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/text v0.3.6/go.mod 
h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= From 9d170efdb5942e65487a7c706f6964411adcdcb3 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 7 Nov 2022 22:30:43 -0600 Subject: [PATCH 03/43] feat: started to factorize msm impl through generics --- ecc/bls12-377/multiexp.go | 1964 +++------------- ecc/bls12-377/multiexp_affine.go | 1986 ++++++----------- ecc/bls12-377/multiexp_test.go | 10 +- ecc/bls12-378/multiexp.go | 1964 +++------------- ecc/bls12-378/multiexp_affine.go | 1986 ++++++----------- ecc/bls12-378/multiexp_test.go | 10 +- ecc/bls12-381/multiexp.go | 1964 +++------------- ecc/bls12-381/multiexp_affine.go | 1986 ++++++----------- ecc/bls12-381/multiexp_test.go | 10 +- ecc/bls24-315/multiexp.go | 1964 +++------------- ecc/bls24-315/multiexp_affine.go | 1986 ++++++----------- ecc/bls24-315/multiexp_test.go | 10 +- ecc/bls24-317/multiexp.go | 1964 +++------------- ecc/bls24-317/multiexp_affine.go | 1986 ++++++----------- ecc/bls24-317/multiexp_test.go | 10 +- ecc/bn254/multiexp.go | 1964 +++------------- ecc/bn254/multiexp_affine.go | 1986 ++++++----------- ecc/bn254/multiexp_test.go | 10 +- ecc/bw6-633/multiexp.go | 460 ++-- ecc/bw6-633/multiexp_affine.go | 540 ++++- ecc/bw6-633/multiexp_test.go | 10 +- ecc/bw6-756/multiexp.go | 474 ++-- ecc/bw6-756/multiexp_affine.go | 540 ++++- ecc/bw6-756/multiexp_test.go | 10 +- ecc/bw6-761/multiexp.go | 474 ++-- ecc/bw6-761/multiexp_affine.go | 540 ++++- ecc/bw6-761/multiexp_test.go | 10 +- internal/generator/ecc/generate.go | 32 +- .../generator/ecc/template/multiexp.go.tmpl | 49 +- .../ecc/template/multiexp_affine.go.tmpl | 143 +- .../ecc/template/tests/multiexp.go.tmpl | 4 +- 31 files changed, 7524 insertions(+), 19522 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 6d4f14f13b..0f487a104c 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, 
scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - +func 
msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+ nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+ if (fr.Limbs*64)%c != 0 {
+ nbChunks++
+ }
 // for each chunk, spawn one go routine that'll loop through all the scalars in the
 // corresponding bit-window
 // note that buckets is an array allocated on the stack (for most sizes of c) and this is
 // critical for performance

 // each go routine sends its result in chChunks[i] channel
- var chChunks [nbChunks]chan g1JacExtended
+ chChunks := make([]chan g1JacExtended, nbChunks)
 for i := 0; i < len(chChunks); i++ {
 chChunks[i] = make(chan g1JacExtended, 1)
 }

+ if (fr.Limbs*64)%c != 0 {
+ // TODO @gbotrel not always needed to do ext jac here.
+ go func(j uint64, points []G1Affine, scalars []fr.Element) {
+ // var buckets LB
+ // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+ // buckets := make([]g1JacExtended, 1<<(lastC-1))
+ // TODO @gbotrel last C restore.
+ msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+ }(uint64(nbChunks-1), points, scalars)
+ nbChunks--
+ }
+
 processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
- var buckets [1 << (c - 1)]g1JacExtended
- msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+ msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 }

 for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 }()
 }

- return msmReduceChunkG1Affine(p, c, chChunks[:])
+ return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }

-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
- const (
- c = 5 // scalars partitioned into c-bit radixes
- nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
- )
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+ var _p G2Jac
+ if _, err := _p.MultiExp(points, scalars, config); err != nil {
+ return nil, err
+ }
+ p.FromJacobian(&_p)
+ return p, nil
+}

- // for each chunk, spawn one go routine that'll loop through all the scalars in the
- // corresponding bit-window
- // note that buckets is an array allocated on the stack (for most sizes of c) and this is
- // critical for performance
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+ // note:
+ // each of the msmCX methods is the same, except for the c constant it declares
+ // duplicating (through template generation) these methods allows us to declare the buckets on the stack
+ // the choice of c needs to be improved:
+ // there is a theoretical value that gives optimal asymptotics
+ // but in practice, other factors come into play, including:
+ // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+ // * number of CPUs
+ // * cache friendliness (which depends on the host, G1 or G2... )
+ // --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

+ // for each msmCX
+ // step 1
+ // we compute, for each scalar over c-bit wide windows, nbChunk digits
+ // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+ // 2^{c} from the current digit, making it negative.
+ // negative digits will be processed in the next step as adding -G into the bucket instead of G
+ // (computing -G is cheap, and this saves us half of the buckets)
+ // step 2
+ // buckets are declared on the stack
+ // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+ // we use jacobian extended formulas here as they are faster than mixed addition
+ // msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+ // step 3
+ // reduce the buckets weighted sums into our result (msmReduceChunk)

+ // ensure len(points) == len(scalars)
+ nbPoints := len(points)
+ if nbPoints != len(scalars) {
+ return nil, errors.New("len(points) != len(scalars)")
+ }

+ // if nbTasks is not set, use all available CPUs
+ if config.NbTasks <= 0 {
+ config.NbTasks = runtime.NumCPU()
+ } else if config.NbTasks > 1024 {
+ return nil, errors.New("invalid config: config.NbTasks > 1024")
+ }

+ // here, we compute the best C for nbPoints
+ // we split recursively until nbChunks(c) >= nbTasks,
+ bestC := func(nbPoints int) uint64 {
+ // implemented msmC methods (the c we use must be in this slice)
+ implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+ var C uint64
+ // approximate cost (in group operations)
+ // cost = bits/c * (nbPoints + 2^{c})
+ // this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+ min := math.MaxFloat64
+ for _, c := range implementedCs {
+ cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+ cost := float64(cc) / float64(c)
+ if cost < min {
+ min = cost
+ C = c
+ }
+ }
+ // empirical, needs to be tuned.
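+ // as a rough illustration of the cost model above (numbers are indicative only):
+ // with fr.Limbs = 4 (256-bit scalars) and nbPoints = 2^20,
+ // c = 16 gives 256/16 * (2^20 + 2^16) ~= 17.8M group operations,
+ // while c = 8 gives 256/8 * (2^20 + 2^8) ~= 33.6M.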
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } } - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) -func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // we have nbSplits intermediate results that we must sum together. 
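+ // the first nbSplits-1 sub-MSMs below each run on their own goroutine; the last
+ // one is computed inline on p, and the partial results are accumulated as each
+ // goroutine signals completion on chDone.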
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
+	var _p g2JacExtended
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.double(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.add(&totalj)
 	}

-	return msmReduceChunkG2Affine(p, c, chChunks[:])
+	return p.unsafeFromJacExtended(&_p)
 }

-func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 14                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
+func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {

-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	var buckets B
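+	// note: B is one of the fixed-size bucketg2JacExtendedCX array types, so the
+	// 2^{c-1} buckets are stack-allocated through the type parameter rather than
+	// heap-allocated via a slice; only half of the 2^c window values need a bucket
+	// because negative digits subtract from the same buckets (see below).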
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}

-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 15                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
 	}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
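+	// (running-sum trick: scanning k = n-1 down to 0, runningSum holds
+	// bucket[k] + ... + bucket[n-1]; adding it into total at every step counts
+	// bucket[k] exactly k+1 times, e.g. for n = 3:
+	// (b2) + (b2+b1) + (b2+b1+b0) = 3*b2 + 2*b1 + b0,
+	// i.e. ~2n group additions instead of n small scalar multiplications.)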
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	var runningSum, total g2JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
 	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	chRes <- total

-	return msmReduceChunkG2Affine(p, c, chChunks[:])
 }

-func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 16                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}
-
-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 20                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance

 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
+	chChunks := make([]chan g2JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g2JacExtended, 1)
 	}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks),
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 876ae12f01..8ff827c422 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+		bucketG1AffineC10 |
+		bucketG1AffineC11 |
+		bucketG1AffineC12 |
+		bucketG1AffineC13 |
+		bucketG1AffineC14 |
+		bucketG1AffineC15 |
+		bucketG1AffineC16 |
+		bucketG1AffineC17 |
+		bucketG1AffineC18 |
+		bucketG1AffineC19 |
+		bucketG1AffineC20 |
+		bucketG1AffineC21 |
+		bucketG1AffineC22 |
+		bucketG1AffineC23
+}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC2 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC17 |
+		bucketg1JacExtendedC18 |
+		bucketg1JacExtendedC19 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21 |
+		bucketg1JacExtendedC22 |
+		bucketg1JacExtendedC23
+}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
 	}
+	p.FromJacobian(&_p)
+	return p, nil
+}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
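To make the cache-line remark concrete, here is a quick standalone check (a sketch, not part of the generated code; it only assumes the public bn254 package):

package main

import (
	"fmt"
	"unsafe"

	"github.com/consensys/gnark-crypto/ecc/bn254"
)

func main() {
	var g1 bn254.G1Affine
	var g2 bn254.G2Affine
	fmt.Println(unsafe.Sizeof(g1)) // 64: two 4-limb fp.Element coordinates -> one 64-byte cache line
	fmt.Println(unsafe.Sizeof(g2)) // 128: coordinates live in E2, twice as wide -> two cache lines
}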
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
 	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks.
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}
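Plugging numbers into the model above makes the trade-off concrete. A throwaway sketch (assuming 4x64-bit scalars, i.e. fr.Limbs = 4, as in this package):

package main

import "fmt"

func main() {
	const bits = 4 * 64 // fr.Limbs * 64
	nbPoints := 1 << 20
	for _, c := range []uint64{8, 16, 21} {
		// cost = bits/c * (nbPoints + 2^c), the approximate group-operation count
		cost := float64(bits) * float64(nbPoints+(1<<c)) / float64(c)
		fmt.Printf("c=%-2d cost ~%.1fM group ops\n", c, cost/1e6)
	}
	// prints roughly 33.6M (c=8), 17.8M (c=16), 38.3M (c=21):
	// mid-size windows minimize the model at this instance size, matching the min search above.
}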
-	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
-}
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}

-func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 12                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+	// we have nbSplits intermediate results that we must sum together.
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
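To make the cost heuristic in the comment above concrete, here is a minimal, self-contained sketch of the window-size selection. It assumes 256-bit scalars (fr.Limbs * 64 for these curves) and a candidate list mirroring the implementedCs slice in this patch; as the comments note, the real choice is also tuned empirically.

package main

import (
	"fmt"
	"math"
)

// bestC mirrors the heuristic cost model used in this patch:
// cost(c) ~= (scalarBits / c) * (nbPoints + 2^c)
// i.e. bits/c chunks, each paying roughly nbPoints bucket additions
// plus 2^c additions to fold the buckets.
func bestC(nbPoints int) uint64 {
	const scalarBits = 256 // assumption: fr.Limbs * 64
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	var C uint64
	best := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(scalarBits) / float64(c) * float64(nbPoints+(1<<c))
		if cost < best {
			best = cost
			C = c
		}
	}
	return C
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 22} {
		fmt.Println(n, "points -> c =", bestC(n))
	}
}

Larger instances push the optimum toward bigger windows, which is why hand-picked values can still beat the formula on specific hardware, as the comment above observes.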
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
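The borrow described in step 1 above, and the carry that partitionScalars propagates from window to window, can be shown on a plain uint64; the real code operates on 4-limb fr.Element values, and signedDigits is a hypothetical helper written only for illustration.

package main

import "fmt"

// signedDigits recodes a scalar into c-bit signed digits: any digit in
// [2^{c-1}, 2^c) becomes digit-2^c (negative) plus a carry of 1 into the
// next window. Only 2^{c-1} buckets are then needed, since a negative
// digit means adding -G, which is cheap to compute.
func signedDigits(scalar uint64, c uint) []int64 {
	var digits []int64
	carry := int64(0)
	mask := uint64(1)<<c - 1
	for shift := uint(0); shift < 64; shift += c {
		d := int64(scalar>>shift&mask) + carry
		carry = 0
		if d >= int64(1)<<(c-1) {
			d -= int64(1) << c
			carry = 1
		}
		digits = append(digits, d)
	}
	if carry != 0 {
		// a leftover carry needs one extra chunk
		digits = append(digits, carry)
	}
	return digits
}

func main() {
	// with c = 3, the low window of 15 is 7 >= 2^2, so it is recoded as
	// -1 with a carry: the digits start [-1 2 0 ...] and -1*1 + 2*8 = 15.
	fmt.Println(signedDigits(15, 3))
}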
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G2Affine
-			msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g2JacExtended
-			msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}

-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]G2Affine
-		msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	batch := newBatchG2Affine(&buckets, points)
+	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
+	nbBatches := 0
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		op := batchOp{pointID: uint32(i) << 1}
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			op.bucketID = uint32(bits - 1)
+			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
+		} else {
+			// sub
+			op.bucketID = (uint32(bits & ^msbWindow))
+			op.pointID += 1
+			// op.isNeg = true
+			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
+		}
+		if batch.CanAdd(op.bucketID) {
+			batch.Add(op)
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+				nbBatches++
+				if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing
+					batch.Add(queue[len(queue)-1])
+					queue = queue[:len(queue)-1]
+				}
+			}
+		} else {
+			// put it in queue.
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 9e40c04401..a14b7946f2 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index b42536f1b9..ebc19dc090 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func 
msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64,
 	chRes chan<- g1JacExtended,
-	buckets []g1JacExtended,
 	c uint64,
 	points []G1Affine,
 	scalars []fr.Element) {
@@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64,
 	mask := uint64((1 << c) - 1) // low c bits are 1
 	msbWindow := uint64(1 << (c - 1))
+	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
@@ -424,26 +448,36 @@
 
 }
 
-func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 4                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
+func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance
 
 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g1JacExtended
+	chChunks := make([]chan g1JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
 
+	if (fr.Limbs*64)%c != 0 {
+		// TODO @gbotrel not always needed to do ext jac here.
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			// var buckets LB
+			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+			// buckets := make([]g1JacExtended, 1<<(lastC-1))
+			// TODO @gbotrel last C restore.
+			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+		}(uint64(nbChunks-1), points, scalars)
+		nbChunks--
+	}
+
 	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 	}
 
 	for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 		}()
 	}
 
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
+	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
 
-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 5                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExp(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
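The generic drivers above (msmCG1Affine[B, LB] and the G2 counterpart) hinge on one Go generics detail: the bucket storage is a type parameter constrained to fixed-size array types, so `var buckets B` is a stack allocation whose length is a compile-time constant rather than a heap slice sized at runtime. A stripped-down sketch of the pattern follows; the names point, ibBuckets and bucketC4/bucketC5 are illustrative and not part of this patch.

package main

import "fmt"

type point struct{ x, y int }

// one array type per window size, as in the bucket*C* declarations above
type bucketC4 [1 << (4 - 1)]point
type bucketC5 [1 << (5 - 1)]point

// a union constraint listing the permitted bucket array types
type ibBuckets interface {
	bucketC4 | bucketC5
}

func processChunk[B ibBuckets](label string) {
	var buckets B // array value: lives on the stack, no make() needed
	fmt.Println(label, "->", len(buckets), "buckets")
}

func main() {
	processChunk[bucketC4]("c=4") // c=4 -> 8 buckets
	processChunk[bucketC5]("c=5") // c=5 -> 16 buckets
}

Because each instantiation fixes len(buckets) at compile time, the per-chunk loop costs nothing extra compared to the old per-c duplicated methods, while the switch in msmInnerG1Jac shrinks to one generic call per window size.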
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.

+	// for each msmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)

+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}

+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}

+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks.
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
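For intuition, this cost model can be exercised on its own; a minimal standalone sketch (assuming 256-bit scalars, i.e. fr.Limbs == 4; bestCSketch is a hypothetical name, not part of this patch):

	import "math"

	// bestCSketch mirrors the selection loop in the diff:
	// cost(c) = bits/c * (nbPoints + 2^c), minimized over the candidate window sizes.
	func bestCSketch(nbPoints int) uint64 {
		const bits = 256
		bestC, minCost := uint64(1), math.MaxFloat64
		for c := uint64(1); c <= 23; c++ {
			cost := float64(bits) * float64(nbPoints+(1<<c)) / float64(c)
			if cost < minCost {
				minCost, bestC = cost, c
			}
		}
		return bestC
	}

A larger nbPoints amortizes the 2^c bucket term over more points and pushes the optimum toward wider windows, which is why the splitting loop further down recomputes bestC after halving nbPoints.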
+		// for example, on an MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
	}

-	return msmReduceChunkG1Affine(p, c, chChunks[:])
-}
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

-func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 6                          // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)        // number of c-bit radixes in a scalar
-	)
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2Jac, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// we have nbSplits intermediate results that we must sum together.
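To make the splitting loop above concrete, consider a hypothetical run with 256-bit scalars, config.NbTasks = 32, and bestC returning 16 on both calls: the first pass gives nbChunks = 256/16 = 16 < 32, so nbSplits doubles to 2 and nbPoints halves; the second pass gives 16 * 2 = 32 chunks and the loop exits with C = 16. The code just below then launches nbSplits-1 = 1 extra msmInnerG2Jac goroutine on its share of the points and folds the partial results into p.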
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
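Step 1 of the note above (the borrow that keeps every digit's magnitude at most 2^{c-1} and halves the bucket count) can be sketched in isolation; a toy version on a single 64-bit word, assuming c divides 64 (signedDigits is a hypothetical helper, not the patch's partitionScalars):

	// signedDigits splits a word into 64/c signed c-bit digits: a digit larger
	// than 2^(c-1) borrows 2^c from the next window and goes negative, so a
	// negative digit means "subtract the point" and only 2^(c-1) buckets are needed.
	func signedDigits(word uint64, c uint64) []int64 {
		mask := uint64(1)<<c - 1
		digits := make([]int64, 0, 64/c)
		carry := int64(0)
		for shift := uint64(0); shift < 64; shift += c {
			d := int64((word>>shift)&mask) + carry
			carry = 0
			if d > int64(1)<<(c-1) {
				d -= int64(1) << c
				carry = 1
			}
			digits = append(digits, d)
		}
		return digits
	}

For example, with c = 4 the digit 15 becomes -1 with a carry of 1 into the next window, since -1 + 16 = 15.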
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
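+	// buckets has a fixed size of 1<<(c-1) (it is one of the array types
+	// satisfying ibg2JacExtended), so its length is a compile-time constant
+	// and the array can stay on this goroutine's stack.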
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
 
-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 15                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)  // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
 	}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index b92b826e91..4becea22f3 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
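+			// note: when c does not divide fr.Limbs*64, this last chunk spans fewer than
+			// c bits, so a bucket array sized for that narrower lastC window would
+			// suffice; until the lastC handling above is restored, the full-size
+			// extended-Jacobian bucket type J is used instead.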
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+		bucketG1AffineC10 |
+		bucketG1AffineC11 |
+		bucketG1AffineC12 |
+		bucketG1AffineC13 |
+		bucketG1AffineC14 |
+		bucketG1AffineC15 |
+		bucketG1AffineC16 |
+		bucketG1AffineC17 |
+		bucketG1AffineC18 |
+		bucketG1AffineC19 |
+		bucketG1AffineC20 |
+		bucketG1AffineC21 |
+		bucketG1AffineC22 |
+		bucketG1AffineC23
+}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC2 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC17 |
+		bucketg1JacExtendedC18 |
+		bucketg1JacExtendedC19 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21 |
+		bucketg1JacExtendedC22 |
+		bucketg1JacExtendedC23
+}
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
 	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
 
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
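+	//     (concretely: a BN254 G1 affine point is two 32-byte field elements, exactly
+	//     one 64-byte cache line, while a G2 affine point has coordinates in a
+	//     degree-2 extension and occupies 128 bytes, i.e. two cache lines)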
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
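+		// as a rough illustration of the cost formula above, with 256-bit scalars and
+		// nbPoints = 2^20: c = 16 gives 16 windows and ~16*(2^20+2^16) ≈ 1.8e7 group
+		// operations, while c = 12 gives ~21.3*(2^20+2^12) ≈ 2.2e7.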
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
 	}
 
-	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
-}
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
 
-func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 12                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)  // number of c-bit radixes in a scalar
-	)
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+	// we have nbSplits intermediate results that we must sum together.
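+	// each of the nbSplits slices is processed by msmInnerG2JacBatchAffine in its own
+	// goroutine; partial G2Jac results come back on chDone and are accumulated into p.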
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + batch := newBatchG2Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. 
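Note on the scheduling logic here: every operation in a batch shares a single field inversion computed from a snapshot of the bucket values, so a bucket may appear at most once per batch — that is what CanAdd enforces, and why a conflicting op is parked in the queue instead. A hedged, standalone sketch of that invariant, with bucket IDs as plain integers (arrivals, flush and batchSize are illustrative names, not the patch's API):

package main

import "fmt"

func main() {
	arrivals := []uint32{5, 9, 5, 2, 9} // bucket IDs produced by the scalar loop
	const batchSize = 3

	inBatch := make(map[uint32]struct{}, batchSize)
	batch := make([]uint32, 0, batchSize)
	var queue []uint32

	flush := func() {
		if len(batch) == 0 {
			return
		}
		fmt.Println("execute:", batch) // one shared inversion + the affine adds
		batch = batch[:0]
		for k := range inBatch {
			delete(inBatch, k)
		}
	}

	for _, b := range arrivals {
		if _, used := inBatch[b]; used {
			queue = append(queue, b) // same bucket twice in one batch: defer it
			continue
		}
		inBatch[b] = struct{}{}
		batch = append(batch, b)
		if len(batch) == batchSize {
			flush()
		}
	}
	for len(queue) != 0 { // drain deferred ops; buckets are fresh after a flush
		b := queue[len(queue)-1]
		queue = queue[:len(queue)-1]
		batch = append(batch, b)
		flush() // execute even if not full, mirroring the loop above
	}
	flush()
}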
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
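The commented-out lastC above is the width of the final, smaller window when c does not divide the scalar size: lastC = fr.Limbs*64 - c*(fr.Limbs*64/c). It is also what pairs the two type parameters in the case 17..23 dispatch earlier in this diff — e.g. for c = 20, a 256-bit scalar leaves a 16-bit last window, hence bucketG2AffineC20 together with bucketg2JacExtendedC16. A standalone check of that arithmetic, assuming 4 × 64-bit limbs:

package main

import "fmt"

func main() {
	const scalarBits = 4 * 64 // fr.Limbs * 64 for a 256-bit scalar field

	for _, c := range []uint64{17, 18, 19, 20, 21, 22, 23} {
		lastC := scalarBits - c*(scalarBits/c) // bits left over for the final window
		fmt.Printf("c=%d -> lastC=%d (1<<%d last-window buckets)\n", c, lastC, lastC-1)
	}
}

Running it reproduces the pairings in the switch: c=17→1, 18→4, 19→9, 20→16, 21→4, 22→14, 23→3.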
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 466e6499a1..c4acf67088 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 25a18a9457..a66bb3aa70 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func 
msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - +func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b }() } - return msmReduceChunkG1Affine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. +func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { + var _p G2Jac + if _, err := _p.MultiExp(points, scalars, config); err != nil { + return nil, err + } + p.FromJacobian(&_p) + return p, nil +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf +// +// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
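Since this exported method is the entry point that reviewers will exercise, a hedged usage sketch may help (import paths and Generators() as they exist in gnark-crypto at the time of this patch; the point/scalar counts are illustrative):

package main

import (
	"fmt"

	"github.com/consensys/gnark-crypto/ecc"
	bls12381 "github.com/consensys/gnark-crypto/ecc/bls12-381"
	"github.com/consensys/gnark-crypto/ecc/bls12-381/fr"
)

func main() {
	const n = 4
	_, _, _, g2Aff := bls12381.Generators()

	points := make([]bls12381.G2Affine, n)
	scalars := make([]fr.Element, n)
	for i := 0; i < n; i++ {
		points[i] = g2Aff
		scalars[i].SetRandom()
	}

	var result bls12381.G2Jac
	if _, err := result.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: 2}); err != nil {
		panic(err)
	}
	fmt.Println(result.Z.IsZero()) // false unless the sum lands on infinity
}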
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
+
+	// for each msmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use extended Jacobian formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
+
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
+	}
+
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } } - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) -func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // we have nbSplits intermediate results that we must sum together. 
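The bestC heuristic above can be played with in isolation. This sketch (256 scalar bits assumed, i.e. fr.Limbs = 4) reimplements cost = bits/c * (nbPoints + 2^c) outside the library:

package main

import (
	"fmt"
	"math"
)

// bestC mirrors the closure above: pick the window size c minimising
// (bits/c) * (nbPoints + 2^c). Sketch only, not the library function.
func bestC(nbPoints int) uint64 {
	const bits = 256
	min := math.MaxFloat64
	var best uint64
	for c := uint64(1); c <= 23; c++ {
		cost := float64(bits) / float64(c) * float64(nbPoints+(1<<c))
		if cost < min {
			min = cost
			best = c
		}
	}
	return best
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 20} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n))
	}
}

The surrounding loop then doubles nbSplits (halving nbPoints) until nbChunks reaches config.NbTasks, so each split is itself a smaller MSM whose partial result is summed just below.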
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
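// Editor's illustration (not part of the patch): plugging hypothetical numbers into
// the heuristic above, assuming a 256-bit scalar field (fr.Limbs*64 = 256) and
// nbPoints = 1<<20:
//   c = 12: (256/12) * (2^20 + 2^12) ≈ 22.4M group ops
//   c = 16: (256/16) * (2^20 + 2^16) ≈ 17.8M group ops
//   c = 21: (256/21) * (2^20 + 2^21) ≈ 38.4M group ops
// the heuristic therefore picks c = 16 here: once 2^c outgrows nbPoints, the
// per-window bucket cost dominates and larger windows stop paying off.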
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
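// Editor's sketch (not part of the patch): what msmReduceChunkG2Affine above
// computes, restated in plain integer arithmetic. Each chunk result t_j is the
// weighted bucket sum for window j, and the final value is sum_j 2^(c*j) * t_j,
// evaluated Horner-style; the inner `_p.double(&_p)` loop is the 2^c scaling.
func reduceChunksScalar(chunks []uint64, c uint) uint64 {
	// chunks[len(chunks)-1] holds the most significant window
	acc := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		acc <<= c        // c doublings == multiply by 2^c
		acc += chunks[j] // add window j's contribution
	}
	return acc
}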
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 15                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
 	}
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 7970a61d7e..d9469a7fb1 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
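// Editor's note (illustrative, not part of the patch): the window arithmetic the
// TODO above refers to, assuming a 256-bit scalar (fr.Limbs * 64 = 256):
//   nbChunks = ceil(256 / c)
//   lastC    = 256 - c*(256/c)   // leftover high bits, e.g. c = 10 => lastC = 6
// with c = 10 the last chunk spans only 6 bits, so 1<<(6-1) = 32 buckets would
// suffice instead of the full 1<<(10-1) = 512; recovering that saving is what
// "lastC restore" means.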
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+	bucketG1AffineC10 |
+	bucketG1AffineC11 |
+	bucketG1AffineC12 |
+	bucketG1AffineC13 |
+	bucketG1AffineC14 |
+	bucketG1AffineC15 |
+	bucketG1AffineC16 |
+	bucketG1AffineC17 |
+	bucketG1AffineC18 |
+	bucketG1AffineC19 |
+	bucketG1AffineC20 |
+	bucketG1AffineC21 |
+	bucketG1AffineC22 |
+	bucketG1AffineC23
+}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+	bucketg1JacExtendedC2 |
+	bucketg1JacExtendedC3 |
+	bucketg1JacExtendedC4 |
+	bucketg1JacExtendedC5 |
+	bucketg1JacExtendedC6 |
+	bucketg1JacExtendedC7 |
+	bucketg1JacExtendedC8 |
+	bucketg1JacExtendedC9 |
+	bucketg1JacExtendedC10 |
+	bucketg1JacExtendedC11 |
+	bucketg1JacExtendedC12 |
+	bucketg1JacExtendedC13 |
+	bucketg1JacExtendedC14 |
+	bucketg1JacExtendedC15 |
+	bucketg1JacExtendedC16 |
+	bucketg1JacExtendedC17 |
+	bucketg1JacExtendedC18 |
+	bucketg1JacExtendedC19 |
+	bucketg1JacExtendedC20 |
+	bucketg1JacExtendedC21 |
+	bucketg1JacExtendedC22 |
+	bucketg1JacExtendedC23
+}
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
 	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	//	--> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
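+	//	    (illustrative sizes: a BN254 G1Affine is two 32-byte fp.Element coordinates,
+	//	    i.e. exactly one 64-byte cache line, while a G2Affine is two fptower.E2
+	//	    coordinates, i.e. 128 bytes spanning two cache lines)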
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks.
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
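+		// worked example of this cost model (illustrative, fr.Limbs = 4 so bits = 256),
+		// for nbPoints = 1<<20:
+		//   c = 12: 256/12 * (2^20 + 2^12) ~ 22.5M group ops
+		//   c = 16: 256/16 * (2^20 + 2^16) ~ 17.8M group ops
+		//   c = 20: 256/20 * (2^20 + 2^20) ~ 26.8M group ops
+		// under this exact formula the minimum is reached at c = 17.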
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } -func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + // we have nbSplits intermediate results that we must sum together. 
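+	// illustrative walk-through of the fan-out below: with nbSplits = 4, goroutines
+	// i = 0..2 each reduce points[i*nbPoints:(i+1)*nbPoints] into _p[i], the current
+	// goroutine reduces the tail slice into p, and the chDone loop then folds each
+	// finished _p[i] into p with AddAssign.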
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + batch := newBatchG2Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. 
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 05f44f6112..4248afb29d 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 0c3d0039b2..f97aa4e2f4 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func 
msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64,
 	chRes chan<- g1JacExtended,
-	buckets []g1JacExtended,
 	c uint64,
 	points []G1Affine,
 	scalars []fr.Element) {
@@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64,
 	mask := uint64((1 << c) - 1) // low c bits are 1
 	msbWindow := uint64(1 << (c - 1))
 
+	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
@@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64,
 
 }
 
-func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 4                          // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)        // number of c-bit radixes in a scalar
-	)
-
+func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance
 
 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g1JacExtended
+	chChunks := make([]chan g1JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
 
+	if (fr.Limbs*64)%c != 0 {
+		// TODO @gbotrel not always needed to do ext jac here.
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			// var buckets LB
+			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+			// buckets := make([]g1JacExtended, 1<<(lastC-1))
+			// TODO @gbotrel lastC restore.
+			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+		}(uint64(nbChunks-1), points, scalars)
+		nbChunks--
+	}
+
 	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 	}
 
 	for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 		}()
 	}
 
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
+	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
 
-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 5                          // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)        // number of c-bit radixes in a scalar
-	)
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExp(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
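The per-chunk results returned through chChunks are recombined by msmReduceChunkG1Affine exactly like digits of a base-2^c number, doubling c times between digits. A toy integer sketch of that recombination, ignoring the signed-digit encoding the real code uses; reduceChunks is a hypothetical name:

```go
package main

import "fmt"

func reduceChunks(chunks []uint64, c uint) uint64 {
	total := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := uint(0); l < c; l++ {
			total += total // point doubling in the real code
		}
		total += chunks[j] // adding the chunk's bucket sum
	}
	return total
}

func main() {
	const c = 4
	x := uint64(0xBEEF)
	var chunks []uint64 // c-bit digits of x, least significant first
	for v := x; v != 0; v >>= c {
		chunks = append(chunks, v&((1<<c)-1))
	}
	fmt.Printf("%#x\n", reduceChunks(chunks, c)) // prints 0xbeef
}
```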
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
-	}
+	// for each msmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^{c} from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G1Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g1JacExtended
-		msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
 
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } } - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) -func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // we have nbSplits intermediate results that we must sum together. 
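Before the nbSplits intermediate results are produced and summed below, the loop above settles on C and the split count together. For intuition, that selection can be run standalone; a sketch hard-coding fr.Limbs = 4 (256-bit scalars) and the same cost model:

```go
package main

import (
	"fmt"
	"math"
)

// bestC mirrors the cost model above: cost = bits/c * (nbPoints + 2^c),
// minimized over the implemented window sizes 1..23.
func bestC(nbPoints int) uint64 {
	var C uint64
	min := math.MaxFloat64
	for c := uint64(1); c <= 23; c++ {
		cost := float64(4*64*(nbPoints+(1<<c))) / float64(c)
		if cost < min {
			min, C = cost, c
		}
	}
	return C
}

func main() {
	nbPoints, nbTasks := 1<<20, 16
	nbSplits, nbChunks := 1, 0
	var C uint64
	for nbChunks < nbTasks {
		C = bestC(nbPoints)
		nbChunks = int(4 * 64 / C) // chunks per scalar
		if (4*64)%C != 0 {
			nbChunks++ // smaller last window
		}
		nbChunks *= nbSplits
		if nbChunks < nbTasks {
			nbSplits <<= 1 // halve the point set and retry
			nbPoints >>= 1
		}
	}
	fmt.Println("C =", C, "nbSplits =", nbSplits, "nbChunks =", nbChunks)
}
```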
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
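+	// editor's note: buckets is one of the generated bucketg2JacExtendedCn array types;
+	// using a fixed-size array type parameter (rather than a slice) is what lets the
+	// compiler keep the buckets on the stack, which is critical for performance here.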
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
	}

-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c = 15 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
	}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index e08952a333..c2d52847f3 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
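+			// editor's note: for now the (smaller) last window always takes the
+			// extended-Jacobian path; the branches deleted in this hunk used the
+			// affine batch path when lastC >= 10, which the TODO above tracks restoring.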
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+		bucketG1AffineC10 |
+		bucketG1AffineC11 |
+		bucketG1AffineC12 |
+		bucketG1AffineC13 |
+		bucketG1AffineC14 |
+		bucketG1AffineC15 |
+		bucketG1AffineC16 |
+		bucketG1AffineC17 |
+		bucketG1AffineC18 |
+		bucketG1AffineC19 |
+		bucketG1AffineC20 |
+		bucketG1AffineC21 |
+		bucketG1AffineC22 |
+		bucketG1AffineC23
+}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC2 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC17 |
+		bucketg1JacExtendedC18 |
+		bucketg1JacExtendedC19 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21 |
+		bucketg1JacExtendedC22 |
+		bucketg1JacExtendedC23
+}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
	}
+	p.FromJacobian(&_p)
+	return p, nil
+}

-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
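+	// editor's note (illustrative arithmetic, assuming 256-bit scalars): with the
+	// cost model used by bestC below, cost = bits/c * (nbPoints + 2^{c}), taking
+	// nbPoints = 2^20: c = 10 gives 256/10 * (2^20 + 2^10) ≈ 26.9M group operations,
+	// while c = 16 gives 256/16 * (2^20 + 2^16) ≈ 17.8M; larger windows win as long
+	// as the bucket count 2^{c} stays small relative to nbPoints.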
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
+	}

-	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
-}
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}

-func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 12 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+	// we have nbSplits intermediate results that we must sum together.
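+	// editor's note: the first nbSplits-1 slices of points/scalars are each handled
+	// by their own goroutine below; p accumulates the last slice, then the partial
+	// results in _p are summed in as their indices arrive on chDone.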
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
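Every msmCX/batchAffineMsmCX variant removed in this patch repeats the same scaffolding, which the generic functions below fold into one place: spawn one goroutine per c-bit chunk, collect the partial sums over buffered channels, then combine them MSB-first with c doublings between chunks (Horner's rule in base 2^c). A minimal sketch of that shape, assuming a hypothetical `partial` type in place of g1JacExtended/g2JacExtended and a caller-supplied `process` in place of msmProcessChunk:

// Sketch only; `partial` is a hypothetical stand-in for the extended
// Jacobian point types used by the real code.
package msmsketch

type partial struct{ v uint64 }

func (p *partial) double()        { p.v <<= 1 }
func (p *partial) add(q *partial) { p.v += q.v }

// reduceChunks mirrors the fan-out/fan-in shape of the msmCX functions:
// one goroutine per chunk, then an MSB-first Horner reduction.
func reduceChunks(c, nbChunks int, process func(chunk int) partial) partial {
	chChunks := make([]chan partial, nbChunks)
	for i := range chChunks {
		chChunks[i] = make(chan partial, 1) // buffered: workers never block on send
		go func(j int) { chChunks[j] <- process(j) }(i)
	}
	total := <-chChunks[nbChunks-1]
	for j := nbChunks - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			total.double() // shift accumulated chunks up by one c-bit window
		}
		s := <-chChunks[j]
		total.add(&s)
	}
	return total
}

The buffered channels let every chunk worker finish independently, while the reduction consumes results in most-significant-first order.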
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
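The "step 1" comment above (deleted here, re-added for the plain MultiExp further down) is the heart of the bucket-count halving. A hedged sketch of that recoding on a toy single-word scalar, independent of the real fr.Element/partitionScalars layout:

// Sketch only: signed c-bit recoding on a single-word scalar.
package recoding

// signedDigits splits a scalar into base-2^c digits in (-2^{c-1}, 2^{c-1}]:
// a digit above 2^{c-1} borrows 2^c from the next window and goes negative,
// halving the number of buckets needed.
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	half := int64(1) << (c - 1)
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d > half {
			d -= int64(1) << c // borrow 2^c from the next window...
			carry = 1          // ...and repay it there
		}
		digits = append(digits, d)
	}
	return digits
}

For instance, signedDigits(255, 4) yields [-1, 0, 1], i.e. 255 = -1 + 0*2^4 + 1*2^8; a negative digit is processed by adding -G to the bucket, and negating a point is cheap.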
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+}
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G2Affine
-			msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g2JacExtended
-			msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]G2Affine
-		msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	batch := newBatchG2Affine(&buckets, points)
+	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
+	nbBatches := 0
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		op := batchOp{pointID: uint32(i) << 1}
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			op.bucketID = uint32(bits - 1)
+			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
+		} else {
+			// sub
+			op.bucketID = (uint32(bits & ^msbWindow))
+			op.pointID += 1
+			// op.isNeg = true
+			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
+		}
+		if batch.CanAdd(op.bucketID) {
+			batch.Add(op)
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+				nbBatches++
+				if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing
+					batch.Add(queue[len(queue)-1])
+					queue = queue[:len(queue)-1]
+				}
+			}
+		} else {
+			// put it in queue.
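The CanAdd/queue dance here exists because one ExecuteAndReset amortizes a single field inversion across the whole batch of affine additions, so no bucket may appear twice in the same batch: the second addition would read coordinates the first one is about to overwrite. A stripped-down sketch of just that scheduling invariant (illustrative names, not this patch's API):

// Sketch only: the distinct-buckets-per-batch rule behind CanAdd/queue.
package batchsched

// batchOp mirrors the idea of the patch's batchOp: a pending bucket update.
type batchOp struct{ bucketID uint32 }

// conflictBatcher parks an op whose bucket is already in the current batch,
// and retries it once the batch (and its shared inversion) has been flushed.
type conflictBatcher struct {
	current []batchOp
	inBatch map[uint32]struct{}
	queue   []batchOp
	size    int
	flush   func([]batchOp) // assumed to run one batched add + shared inversion
}

func newConflictBatcher(size int, flush func([]batchOp)) *conflictBatcher {
	return &conflictBatcher{inBatch: make(map[uint32]struct{}, size), size: size, flush: flush}
}

func (b *conflictBatcher) add(op batchOp) {
	if _, clash := b.inBatch[op.bucketID]; clash {
		b.queue = append(b.queue, op) // same bucket twice in a batch is unsafe
		return
	}
	b.inBatch[op.bucketID] = struct{}{}
	b.current = append(b.current, op)
	if len(b.current) == b.size {
		b.execute()
	}
}

func (b *conflictBatcher) execute() {
	if len(b.current) == 0 {
		return
	}
	b.flush(b.current)
	b.current = b.current[:0]
	for k := range b.inBatch {
		delete(b.inBatch, k) // all buckets are free again
	}
}

// drain retries queued ops, flushing partial batches so buckets free up.
func (b *conflictBatcher) drain() {
	for len(b.queue) != 0 {
		pending := b.queue
		b.queue = nil
		for _, op := range pending {
			b.add(op)
		}
		b.execute()
	}
}

drain matches the `for len(queue) != 0` loop that follows in the patch: batches are executed even when not full, precisely so that conflicting buckets become available again.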
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 6f5611f563..a942e21132 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 5a1b6797d2..1950ae3ef6 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func 
msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64,
 	chRes chan<- g1JacExtended,
-	buckets []g1JacExtended,
 	c uint64,
 	points []G1Affine,
 	scalars []fr.Element) {
@@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64,
 	mask := uint64((1 << c) - 1) // low c bits are 1
 	msbWindow := uint64(1 << (c - 1))
+	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
@@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64,
 }
-func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 4 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
+func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance
 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g1JacExtended
+	chChunks := make([]chan g1JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
+	if (fr.Limbs*64)%c != 0 {
+		// TODO @gbotrel not always needed to do ext jac here.
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			// var buckets LB
+			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+			// buckets := make([]g1JacExtended, 1<<(lastC-1))
+			// TODO @gbotrel lastC restore.
+			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+		}(uint64(nbChunks-1), points, scalars)
+		nbChunks--
+	}
+
 	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 	}
 	for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 		}()
 	}
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
+	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 5 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExp(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point doesn't.
-
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
-	}
+	// for each msmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunk digits
+	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G1Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g1JacExtended
-		msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+ // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results + min := math.MaxFloat64 + for _, c := range implementedCs { + cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cost := float64(cc) / float64(c) + if cost < min { + min = cost + C = c + } + } + // empirical, needs to be tuned. + // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } } - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) -func (p *G1Jac) msmC6(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2Jac, but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // we have nbSplits intermediate results that we must sum together.
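+	// illustrative trace of the sizing loop above (assuming a 4-limb fr, i.e. 256-bit scalars):
+	// with nbPoints = 1<<20 and config.NbTasks = 16, bestC returns C = 17, so
+	// nbChunks = 15 full 17-bit windows + 1 final window = 16 >= NbTasks; nbSplits
+	// stays 1, the goroutine loop below is skipped, and the single msmInnerG2Jac
+	// call handles the whole input.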
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
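+	// B is one of the bucketg2JacExtendedC1..C23 array types selected in
+	// msmInnerG2Jac (presumably defined elsewhere in this file as
+	// [1 << (c-1)]g2JacExtended); since its length is a compile-time constant,
+	// the bucket array can stay on the stack instead of escaping to the heap
+	// as a slice parameter would.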
+ for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} -func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // for each scalar, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + if bits == 0 { + continue + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + // if msbWindow bit is set, we need to subtract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 6623e42510..1965405349 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ 
-166,102 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of 
the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ 
func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
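+			// the last window is narrower whenever c doesn't divide fr.Limbs*64; e.g. with
+			// 256-bit scalars (fr.Limbs == 4) and c == 10, we get 25 full 10-bit windows plus
+			// a final 6-bit one, so 1<<(6-1) buckets would suffice for this chunk; until the
+			// lastC sizing above is restored, it falls back to the full-size bucket array J.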
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+		bucketG1AffineC10 |
+		bucketG1AffineC11 |
+		bucketG1AffineC12 |
+		bucketG1AffineC13 |
+		bucketG1AffineC14 |
+		bucketG1AffineC15 |
+		bucketG1AffineC16 |
+		bucketG1AffineC17 |
+		bucketG1AffineC18 |
+		bucketG1AffineC19 |
+		bucketG1AffineC20 |
+		bucketG1AffineC21 |
+		bucketG1AffineC22 |
+		bucketG1AffineC23
+}

-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC2 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC17 |
+		bucketg1JacExtendedC18 |
+		bucketg1JacExtendedC19 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21 |
+		bucketg1JacExtendedC22 |
+		bucketg1JacExtendedC23
+}

-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
 	}
+	p.FromJacobian(&_p)
+	return p, nil
+}

-	// c doesn't divide 256, last window is smaller we can allocate fewer buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
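+	//     (a BN254 G1Affine is two 4-limb fp coordinates, 2*32 = 64 bytes, while a G2Affine
+	//     has E2 coordinates twice that size, 2*64 = 128 bytes, i.e. two cache lines)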
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar over c-bit wide windows, nbChunks digits
+	// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
 	}
 
-	return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:])
-}
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
+	}
 
-func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c        = 12                  // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2JacBatchAffine, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+	// we have nbSplits intermediate results that we must sum together.
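+	// e.g. with config.NbTasks == 32 and C == 16 (16 chunks per 256-bit scalar), a single
+	// MSM only yields 16 tasks, so nbSplits doubles to 2: each half of the points is then
+	// processed below as an independent sub-MSM and the partial results are folded back
+	// into p with AddAssign.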
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+}
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G2Affine
-			msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g2JacExtended
-			msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]G2Affine
-		msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	batch := newBatchG2Affine(&buckets, points)
+	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
+	nbBatches := 0
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		op := batchOp{pointID: uint32(i) << 1}
+		// if the msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			op.bucketID = uint32(bits - 1)
+			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
+		} else {
+			// sub
+			op.bucketID = (uint32(bits & ^msbWindow))
+			op.pointID += 1
+			// op.isNeg = true
+			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
+		}
+		if batch.CanAdd(op.bucketID) {
+			batch.Add(op)
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+				nbBatches++
+				if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing
+					batch.Add(queue[len(queue)-1])
+					queue = queue[:len(queue)-1]
+				}
+			}
+		} else {
+			// put it in the queue.
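+			// (the bucket is already touched by a pending op in the current
+			// batch; adding it twice before ExecuteAndReset would make the
+			// second addition read a stale bucket value, so we defer the op
+			// and retry it once the batch has been flushed)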
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
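+			// the spill-over window handled here (c does not divide the scalar
+			// bit size) still goes through extended-Jacobian buckets (type J):
+			// the batch-affine path is only wired for full c-bit windows.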
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index b89f8a2375..293f3ec6ef 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 4c120b4be4..c2ecab3d61 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,50 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points 
[]G1Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.msmC10(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.msmC11(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.msmC12(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.msmC13(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.msmC14(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.msmC15(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.msmC20(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.msmC21(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -360,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B 
ibg1JacExtended](chunk uint64,
 	chRes chan<- g1JacExtended,
-	buckets []g1JacExtended,
 	c uint64,
 	points []G1Affine,
 	scalars []fr.Element) {
@@ -370,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64,
 	mask := uint64((1 << c) - 1) // low c bits are 1
 	msbWindow := uint64(1 << (c - 1))
+	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
@@ -424,26 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64,
 }
-func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 4 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
-
+func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
 	// critical for performance
 	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks]chan g1JacExtended
+	chChunks := make([]chan g1JacExtended, nbChunks)
 	for i := 0; i < len(chChunks); i++ {
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
+	if (fr.Limbs*64)%c != 0 {
+		// TODO @gbotrel not always needed to do ext jac here.
+		go func(j uint64, points []G1Affine, scalars []fr.Element) {
+			// var buckets LB
+			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
+			// buckets := make([]g1JacExtended, 1<<(lastC-1))
+			// TODO @gbotrel lastC restore.
+			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
+		}(uint64(nbChunks-1), points, scalars)
+		nbChunks--
+	}
+
 	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
 	}
 	for j := int(nbChunks - 1); j > 0; j-- {
@@ -466,1719 +500,325 @@ func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, splitFirstChunk b
 		}()
 	}
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
+	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
-func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
-	const (
-		c = 5 // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	)
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExp(points, scalars, config); err != nil {
+		return nil, err
+	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
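+//
+// A minimal usage sketch (assuming points []G2Affine and scalars []fr.Element of equal length):
+//
+//	var p G2Jac
+//	if _, err := p.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}); err != nil {
+//		// handle the error
+//	}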
+func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the msmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows us to declare the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2... )
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
-	}
+	// for each msmCX
+	// step 1
+	// we compute, for each scalar, over c-bit wide windows, nbChunks digits
+	// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G1Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g1JacExtended
-		msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
+	}
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]g1JacExtended
-		msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
 	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks,
+	bestC := func(nbPoints int) uint64 {
+		// implemented msmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
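+		// e.g. with 256-bit scalars, nbPoints = 2^20 and c = 16:
+		// cost = (256/16) * (2^20 + 2^16) ≈ 17.8M group operations
+		// (a sanity check of the formula above, not a measured benchmark)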
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
+		// if C > 16 && nbPoints < 1 << 23 {
+		// 	C = 16
+		// }
+		return C
 	}
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	var C uint64
+	nbSplits := 1
+	nbChunks := 0
+	for nbChunks < config.NbTasks {
+		C = bestC(nbPoints)
+		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+		if (fr.Limbs*64)%C != 0 {
+			nbChunks++
+		}
+		nbChunks *= nbSplits
+		if nbChunks < config.NbTasks {
+			nbSplits <<= 1
+			nbPoints >>= 1
+		}
 	}
-	return msmReduceChunkG1Affine(p, c, chChunks[:])
-}
+	// partition the scalars
+	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
+	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
+	var smallValues int
+	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	// if we have more than 10% of small values, we split the processing of the first chunk in 2
+	// we may want to do that in msmInnerG2Jac, but that would incur a cost of looping through all scalars one more time
+	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
+	// we have nbSplits intermediate results that we must sum together.
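+	// the first nbSplits-1 slices are processed on their own goroutines and the
+	// last one on the calling goroutine; partial results are folded into p with
+	// AddAssign below.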
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) + } - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + switch c { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC7(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 
1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - return msmReduceChunkG1Affine(p, 
c, chChunks[:]) -} + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) -func (p *G1Jac) msmC9(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split 
:= len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, 
chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, 
chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], 
chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], 
scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExp(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the msmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each msmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. 
- // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.msmC10(points, scalars, splitFirstChunk) - - case 11: - p.msmC11(points, scalars, splitFirstChunk) - - case 12: - p.msmC12(points, scalars, splitFirstChunk) - - case 13: - p.msmC13(points, scalars, splitFirstChunk) - - case 14: - p.msmC14(points, scalars, splitFirstChunk) - - case 15: - p.msmC15(points, scalars, splitFirstChunk) - - case 16: - p.msmC16(points, scalars, splitFirstChunk) - - case 20: - p.msmC20(points, scalars, splitFirstChunk) - - case 21: - p.msmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG2Affine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []g2JacExtended, - c 
uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 
256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC6(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 6 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC7(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 7 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - 
chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC9(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 9 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - 
msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, 
scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through 
all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { + var _p g2JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return p.unsafeFromJacExtended(&_p) } -func (p *G2Jac) msmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var buckets B 
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
 	}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
 	}
 
-	return msmReduceChunkG2Affine(p, c, chChunks[:])
-}
-
-func (p *G2Jac) msmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	const (
-		c        = 15                   // scalars partitioned into c-bit radixes
-		nbChunks = (fr.Limbs * 64 / c)  // number of c-bit radixes in a scalar
-	)
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
 
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+		if bits == 0 {
+			continue
+		}
 
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g2JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
 	}
 
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	go func(j uint64, points []G2Affine, scalars []fr.Element) {
-		var buckets [1 << (lastC - 1)]g2JacExtended
-		msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars)
-	}(uint64(nbChunks), points, scalars)
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		var buckets [1 << (c - 1)]g2JacExtended
-		msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars)
-	}
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + chRes <- total - return msmReduceChunkG2Affine(p, c, chChunks[:]) } -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), 
points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -2201,5 +841,5 @@ func (p *G2Jac) msmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index d91b3cb89c..89db40edae 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -166,102 +166,111 @@ 
func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) case 6: - p.msmC6(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) case 7: - p.msmC7(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) case 9: - p.msmC9(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the 
result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -276,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -323,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -338,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -364,7 +372,7 @@ func 
msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -433,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -445,29 +454,25 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
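+			// for now the last (smaller) window falls back to the extended-Jacobian
+			// chunk processor; a dedicated smaller bucket type could be used here
+			// instead (see the TODO above).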
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -490,1343 +495,521 @@ func (p *G1Jac) batchAffineMsmC10(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } -func (p *G1Jac) batchAffineMsmC11(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + 
bucketG1AffineC9 |
+	bucketG1AffineC10 |
+	bucketG1AffineC11 |
+	bucketG1AffineC12 |
+	bucketG1AffineC13 |
+	bucketG1AffineC14 |
+	bucketG1AffineC15 |
+	bucketG1AffineC16 |
+	bucketG1AffineC17 |
+	bucketG1AffineC18 |
+	bucketG1AffineC19 |
+	bucketG1AffineC20 |
+	bucketG1AffineC21 |
+	bucketG1AffineC22 |
+	bucketG1AffineC23
+}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+	bucketg1JacExtendedC2 |
+	bucketg1JacExtendedC3 |
+	bucketg1JacExtendedC4 |
+	bucketg1JacExtendedC5 |
+	bucketg1JacExtendedC6 |
+	bucketg1JacExtendedC7 |
+	bucketg1JacExtendedC8 |
+	bucketg1JacExtendedC9 |
+	bucketg1JacExtendedC10 |
+	bucketg1JacExtendedC11 |
+	bucketg1JacExtendedC12 |
+	bucketg1JacExtendedC13 |
+	bucketg1JacExtendedC14 |
+	bucketg1JacExtendedC15 |
+	bucketg1JacExtendedC16 |
+	bucketg1JacExtendedC17 |
+	bucketg1JacExtendedC18 |
+	bucketg1JacExtendedC19 |
+	bucketg1JacExtendedC20 |
+	bucketg1JacExtendedC21 |
+	bucketg1JacExtendedC22 |
+	bucketg1JacExtendedC23
+}
-	// each go routine sends its result in chChunks[i] channel
-	var chChunks [nbChunks + 1]chan g1JacExtended
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g1JacExtended, 1)
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
+	var _p G2Jac
+	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
+		return nil, err
	}
+	p.FromJacobian(&_p)
+	return p, nil
+}
-	// c doesn't divide 256, last window is smaller we can allocate less buckets
-	const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-	// TODO @gbotrel replace this in code generator
-	if lastC >= 10 {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]G1Affine
-			msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	} else {
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			var buckets [1 << (lastC - 1)]g1JacExtended
-			msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars)
-		}(uint64(nbChunks), points, scalars)
-	}
+
+// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+//
+// This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
+func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
+	// note:
+	// each of the batchAffineMsmCX methods is the same, except for the c constant it declares
+	// duplicating (through template generation) these methods allows declaring the buckets on the stack
+	// the choice of c needs to be improved:
+	// there is a theoretical value that gives optimal asymptotics
+	// but in practice, other factors come into play, including:
+	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
+	// * number of CPUs
+	// * cache friendliness (which depends on the host, G1 or G2...)
+	// --> for example, on BN254, a G1 point fits into one cache line of 64 bytes, but a G2 point doesn't.
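The bucketG1AffineCX array types and the ibG1Affine / ibg1JacExtended unions above make it possible for one generic function to declare its buckets as a fixed-size array, keeping them on the stack. A self-contained sketch of the same Go 1.18 type-set pattern, with toy names that are not part of the library:

package main

import "fmt"

type point struct{ x, y uint64 }

// one concrete array type per window size keeps lengths compile-time constants
type bucketsC2 [1 << (2 - 1)]point
type bucketsC3 [1 << (3 - 1)]point

// the constraint is a union of the concrete array types
type ibuckets interface {
	bucketsC2 | bucketsC3
}

// process declares buckets as a value of the type parameter: a fixed-size
// array the compiler can keep on the stack, unlike a heap-allocated slice.
func process[B ibuckets]() int {
	var buckets B
	return len(buckets)
}

func main() {
	fmt.Println(process[bucketsC2](), process[bucketsC3]()) // 2 4
}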
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		var buckets [1 << (c - 1)]G1Affine
-		msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars)
+	// for each batchAffineMsmCX
+	// step 1
+	// we compute, for each scalar, nbChunks digits over c-bit wide windows
+	// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+	// 2^{c} from the current digit, making it negative.
+	// negative digits will be processed in the next step as adding -G into the bucket instead of G
+	// (computing -G is cheap, and this saves us half of the buckets)
+	// step 2
+	// buckets are declared on the stack
+	// notice that we have 2^{c-1} buckets instead of 2^{c} (see step 1)
+	// we use jacobian extended formulas here as they are faster than mixed addition
+	// msmProcessChunk places points into buckets based on their selector and returns the weighted bucket sum in the given channel
+	// step 3
+	// reduce the buckets' weighted sums into our result (msmReduceChunk)
+
+	// ensure len(points) == len(scalars)
+	nbPoints := len(points)
+	if nbPoints != len(scalars) {
+		return nil, errors.New("len(points) != len(scalars)")
	}
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
+	// if nbTasks is not set, use all available CPUs
+	if config.NbTasks <= 0 {
+		config.NbTasks = runtime.NumCPU()
+	} else if config.NbTasks > 1024 {
+		return nil, errors.New("invalid config: config.NbTasks > 1024")
	}
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
+	// here, we compute the best C for nbPoints
+	// we split recursively until nbChunks(c) >= nbTasks
+	bestC := func(nbPoints int) uint64 {
+		// implemented batchAffineMsmC methods (the c we use must be in this slice)
+		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		var C uint64
+		// approximate cost (in group operations)
+		// cost = bits/c * (nbPoints + 2^{c})
+		// this needs to be verified empirically.
+		// for example, on a MBP 2016, for G2 MultiExp > 8M points, hand-picking c gives better results
+		min := math.MaxFloat64
+		for _, c := range implementedCs {
+			cc := fr.Limbs * 64 * (nbPoints + (1 << (c)))
+			cost := float64(cc) / float64(c)
+			if cost < min {
+				min = cost
+				C = c
+			}
+		}
+		// empirical, needs to be tuned.
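The heuristic above minimizes cost = bits/c * (nbPoints + 2^c) over the implemented window sizes: bits/c chunks, each costing roughly one bucket addition per point plus on the order of 2^c operations to fold the buckets. A toy, self-contained version of the same search, as a re-derivation for illustration rather than the library's code:

package main

import (
	"fmt"
	"math"
)

// bestC minimizes bits/c * (nbPoints + 2^c), the approximate
// group-operation count of the bucket method.
func bestC(nbPoints, scalarBits int, implementedCs []uint64) uint64 {
	var best uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(scalarBits) / float64(c) * float64(nbPoints+(1<<c))
		if cost < min {
			min, best = cost, c
		}
	}
	return best
}

func main() {
	cs := []uint64{4, 8, 12, 16, 20}
	// more points shift the optimum toward wider windows: prints 8 20
	fmt.Println(bestC(1<<10, 256, cs), bestC(1<<22, 256, cs))
}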
+ // if C > 16 && nbPoints < 1 << 23 { + // C = 16 + // } + return C } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + var C uint64 + nbSplits := 1 + nbChunks := 0 + for nbChunks < config.NbTasks { + C = bestC(nbPoints) + nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ + } + nbChunks *= nbSplits + if nbChunks < config.NbTasks { + nbSplits <<= 1 + nbPoints >>= 1 + } + } -func (p *G1Jac) batchAffineMsmC12(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + // partition the scalars + // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) + // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window + var smallValues int + scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // if we have more than 10% of small values, we split the processing of the first chunk in 2 + // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time + splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + // we have nbSplits intermediate results that we must sum together. 
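The loop above doubles nbSplits (halving nbPoints) until nbSplits * nbChunks(C) covers config.NbTasks. The same sizing logic in isolation, a sketch with a fixed bestC so the numbers are easy to follow:

package main

import "fmt"

// splitPlan mirrors the sizing loop: double nbSplits and halve nbPoints
// until the total chunk count reaches nbTasks.
func splitPlan(nbPoints, nbTasks, scalarBits int, bestC func(int) uint64) (uint64, int) {
	var c uint64
	nbSplits, nbChunks := 1, 0
	for nbChunks < nbTasks {
		c = bestC(nbPoints)
		nbChunks = scalarBits / int(c)
		if scalarBits%int(c) != 0 {
			nbChunks++
		}
		nbChunks *= nbSplits
		if nbChunks < nbTasks {
			nbSplits <<= 1
			nbPoints >>= 1
		}
	}
	return c, nbSplits
}

func main() {
	fixedC := func(int) uint64 { return 16 } // pretend bestC always picks 16
	c, splits := splitPlan(1<<20, 64, 256, fixedC)
	fmt.Println(c, splits) // 16 4: four splits of 16 chunks each cover 64 tasks
}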
+ _p := make([]G2Jac, nbSplits-1) + chDone := make(chan int, nbSplits-1) + for i := 0; i < nbSplits-1; i++ { + start := i * nbPoints + end := start + nbPoints + go func(start, end, i int) { + msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + chDone <- i + }(start, end, i) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + for i := 0; i < nbSplits-1; i++ { + done := <-chDone + p.AddAssign(&_p[done]) } + close(chDone) + return p, nil +} - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } +func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + switch c { - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC13(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 4: + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 5: + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code 
generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 8: + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) -func (p *G1Jac) batchAffineMsmC14(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } + case 14: + 
batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 16: + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC15(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go 
routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC20(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G1Jac) batchAffineMsmC21(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // 
number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. 
- - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 4: - p.msmC4(points, scalars, splitFirstChunk) - - case 5: - p.msmC5(points, scalars, splitFirstChunk) - - case 6: - p.msmC6(points, scalars, splitFirstChunk) - - case 7: - p.msmC7(points, scalars, splitFirstChunk) - - case 8: - p.msmC8(points, scalars, splitFirstChunk) - - case 9: - p.msmC9(points, scalars, splitFirstChunk) - - case 10: - p.batchAffineMsmC10(points, scalars, splitFirstChunk) - - case 11: - p.batchAffineMsmC11(points, scalars, splitFirstChunk) - - case 12: - p.batchAffineMsmC12(points, scalars, splitFirstChunk) - - case 13: - p.batchAffineMsmC13(points, scalars, splitFirstChunk) - - case 14: - p.batchAffineMsmC14(points, scalars, splitFirstChunk) - - case 15: - p.batchAffineMsmC15(points, scalars, splitFirstChunk) - - case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) - - case 20: - p.batchAffineMsmC20(points, scalars, splitFirstChunk) - - case 21: - p.batchAffineMsmC21(points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine -} - -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), - } -} - -func (b *BatchG2Affine) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - B := &b.buckets[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if B.IsInfinity() { - if op.isNeg() { - B.Neg(P) - } else { 
- B.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG2AffineBatchAffine(chunk uint64, - chRes chan<- g2JacExtended, - buckets []G2Affine, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func (p *G2Jac) batchAffineMsmC10(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 10 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC11(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 11 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else 
{ - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC12(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 12 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC13(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 13 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that 
buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) -} - -func (p *G2Jac) batchAffineMsmC14(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 14 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - 
}(uint64(nbChunks), points, scalars) - } + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) - } + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + default: + panic("not implemented") } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) } -func (p *G2Jac) batchAffineMsmC15(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 15 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE } - - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + if batchSize <= 0 { + batchSize = 1 } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } +} - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} - if !splitFirstChunk { - 
go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + b.P[b.cptP].Set(P) } - - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + b.cptP++ } -func (p *G2Jac) batchAffineMsmC20(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 20 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + 
queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance +} - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) + batch := newBatchG2Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. 
+ queue = append(queue, op) + } + } + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG2Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + chRes <- total + } -func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 21 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -1834,29 +1017,25 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - // c doesn't divide 256, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
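The reduction above relies on a running-sum identity: scanning from the top bucket, runningSum holds bucket[k] + ... + bucket[n-1] and is added into total once per index, yielding 1*bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] with additions only. The same identity over plain integers, for illustration only:

package main

import "fmt"

// reduce returns 1*b[0] + 2*b[1] + ... + n*b[n-1] in a single
// top-down pass, mirroring the jacobian-extended bucket reduction.
func reduce(b []int) int {
	runningSum, total := 0, 0
	for k := len(b) - 1; k >= 0; k-- {
		runningSum += b[k] // b[k] + b[k+1] + ... + b[n-1]
		total += runningSum
	}
	return total
}

func main() {
	fmt.Println(reduce([]int{3, 1, 4})) // 1*3 + 2*1 + 3*4 = 17
}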
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -1879,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC21(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 61341020f0..8a8ee0e90d 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 2006674935..e49016a90e 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,17 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, spl switch c { + case 1: 
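+		// each case pins the generic msm to two bucket-array sizes: the first
+		// covers the regular c-bit windows, the second the smaller trailing
+		// window of (fr.Limbs*64) mod c bits (e.g. the C3/C2 pairing below,
+		// since 320 mod 3 = 2 for this curve's 5-limb scalars).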
+ msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC2](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC5](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC2](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC5](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC5](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC10](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC1](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC8](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC8](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC12](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC5](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC14](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC21](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -327,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -337,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -391,161 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, 
splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each 
go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. 
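+			// this goroutine takes the trailing partial window; nbChunks is
+			// decremented just below so the main dispatch loop only schedules
+			// full c-bit windows.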
+ msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -568,7 +500,7 @@ func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG1Affine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -630,7 +562,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -703,17 +635,74 @@ func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC2](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC5](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC2](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC5](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC5](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC10](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC1](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC8](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC8](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC12](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC5](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC14](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC14](p, 18, 
points, scalars, splitFirstChunk) + + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC21](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -736,9 +725,8 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine(chunk uint64, +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, - buckets []g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element) { @@ -746,6 +734,7 @@ func msmProcessChunkG2Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -800,161 +789,36 @@ func msmProcessChunkG2Affine(chunk uint64, } -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - 
msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. 
+ msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -977,5 +841,5 @@ func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 79740b7a69..5064b454a6 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -166,69 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC2](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC5](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC2](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC5](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC5](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC10](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC1](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC8](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC8](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC12](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC5](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC14](p, 17, 
points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC21](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -243,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -290,7 +332,7 @@ 
func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -305,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -331,7 +372,7 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -400,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -412,14 +454,25 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
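+			// the trailing window takes the extended-jacobian route even in
+			// the batch-affine msm, mirroring the c < 10 cases of the switch
+			// above (batch inversion seems worthwhile only for larger windows,
+			// hence the cutoff).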
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -442,7 +495,106 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) +} + +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + bucketG1AffineC9 | + bucketG1AffineC10 | + bucketG1AffineC11 | + bucketG1AffineC12 | + bucketG1AffineC13 | + bucketG1AffineC14 | + bucketG1AffineC15 | + bucketG1AffineC16 | + bucketG1AffineC17 | + bucketG1AffineC18 | + bucketG1AffineC19 | + 
bucketG1AffineC20 | + bucketG1AffineC21 | + bucketG1AffineC22 | + bucketG1AffineC23 +} + +type ibg1JacExtended interface { + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC6 | + bucketg1JacExtendedC7 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC9 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC13 | + bucketg1JacExtendedC14 | + bucketg1JacExtendedC15 | + bucketg1JacExtendedC16 | + bucketg1JacExtendedC17 | + bucketg1JacExtendedC18 | + bucketg1JacExtendedC19 | + bucketg1JacExtendedC20 | + bucketg1JacExtendedC21 | + bucketg1JacExtendedC22 | + bucketg1JacExtendedC23 } // MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -504,7 +656,7 @@ func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -577,69 +729,111 @@ func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.E switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC2](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC5](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC2](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC5](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC5](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC10](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC1](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC8](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC8](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC12](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC5](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC14](p, 17, points, scalars, 
splitFirstChunk) + + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC21](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine{ + return BatchG2Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG2Affine) IsFull() bool { +func (b *BatchG2Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine) ExecuteAndReset() { +func (b *BatchG2Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -654,45 +848,45 @@ func (b *BatchG2Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine) CanAdd(bID uint32) bool { +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine) Add(op batchOp) { +func (b *BatchG2Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -701,7 +895,7 @@ func (b 
*BatchG2Affine) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -716,16 +910,15 @@ func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { } -func msmProcessChunkG2AffineBatchAffine(chunk uint64, +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, - buckets []G2Affine, c uint64, points []G2Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -742,7 +935,7 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG2Affine(buckets, points) + batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -811,11 +1004,12 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -823,14 +1017,25 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
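+			// same scheme as G1: the trailing window's buckets stay in
+			// extended-jacobian form even on the batch-affine path.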
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -853,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 282b60e573..32d3f4e986 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 698835f344..2ab93e7fd9 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,17 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + 
msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -327,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -337,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -391,168 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, 
splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 384, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const 
( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. 
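+			// Editorial note: the LB type parameter plays the role of the old
+			// lastC bucket array. When c does not divide fr.Limbs*64, the top
+			// window only has lastC = (fr.Limbs*64) - c*((fr.Limbs*64)/c) bits,
+			// so 1<<(lastC-1) buckets suffice; e.g. with fr.Limbs = 6 (384 bits)
+			// and c = 5, lastC = 384 - 5*76 = 4, matching the
+			// bucketg1JacExtendedC4 argument in the c = 5 case above.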
+ msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -575,7 +500,7 @@ func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG1Affine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -637,7 +562,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -710,17 +635,74 @@ func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC6](p, 18, 
points, scalars, splitFirstChunk) + + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -743,9 +725,8 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine(chunk uint64, +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, - buckets []g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element) { @@ -753,6 +734,7 @@ func msmProcessChunkG2Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -807,168 +789,36 @@ func msmProcessChunkG2Affine(chunk uint64, } -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 384, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go 
func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. 
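+		// Editorial note: this goroutine handles the single partial window up
+		// front with its own, smaller bucket type LB, and nbChunks is then
+		// decremented so the loop below only spawns workers for full c-bit
+		// windows.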
+ go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -991,5 +841,5 @@ func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 427f4d3891..3b533e9059 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -166,69 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - 
p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -243,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if 
BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -290,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -305,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -331,7 +372,7 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -400,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -412,14 +454,25 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
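+			// Editorial note: the trailing window always takes the
+			// extended-Jacobian path (bucket type J) even inside the
+			// batch-affine MSM; per the TODO above this is conservative, since
+			// for some c the last window is still wide (e.g. c = 23 leaves a
+			// 16-bit window) and the batch-affine path could pay off there too.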
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -442,7 +495,106 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) +} + +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + bucketG1AffineC9 | + bucketG1AffineC10 | + bucketG1AffineC11 | + bucketG1AffineC12 | + bucketG1AffineC13 | + bucketG1AffineC14 | + bucketG1AffineC15 | + bucketG1AffineC16 | + bucketG1AffineC17 | + bucketG1AffineC18 | + bucketG1AffineC19 | + 
bucketG1AffineC20 | + bucketG1AffineC21 | + bucketG1AffineC22 | + bucketG1AffineC23 +} + +type ibg1JacExtended interface { + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC6 | + bucketg1JacExtendedC7 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC9 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC13 | + bucketg1JacExtendedC14 | + bucketg1JacExtendedC15 | + bucketg1JacExtendedC16 | + bucketg1JacExtendedC17 | + bucketg1JacExtendedC18 | + bucketg1JacExtendedC19 | + bucketg1JacExtendedC20 | + bucketg1JacExtendedC21 | + bucketg1JacExtendedC22 | + bucketg1JacExtendedC23 } // MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -504,7 +656,7 @@ func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -577,69 +729,111 @@ func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.E switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC10](p, 17, points, scalars, 
splitFirstChunk) + + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine{ + return BatchG2Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG2Affine) IsFull() bool { +func (b *BatchG2Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine) ExecuteAndReset() { +func (b *BatchG2Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -654,45 +848,45 @@ func (b *BatchG2Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine) CanAdd(bID uint32) bool { +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine) Add(op batchOp) { +func (b *BatchG2Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -701,7 +895,7 @@ func (b 
*BatchG2Affine) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -716,16 +910,15 @@ func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { } -func msmProcessChunkG2AffineBatchAffine(chunk uint64, +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, - buckets []G2Affine, c uint64, points []G2Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -742,7 +935,7 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG2Affine(buckets, points) + batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -811,11 +1004,12 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -823,14 +1017,25 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
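+			// Editorial note: as on the G1 side, only the full c-bit windows
+			// below use the batch-affine buckets B (via
+			// msmProcessChunkG2AffineBatchAffine); the smaller trailing window
+			// reuses the ext-Jacobian chunk processor with bucket type J.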
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -853,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index d101a5c9a6..6cbf26cdfa 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index e9cef54ee0..cfaef03004 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -221,7 +221,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -294,17 +294,74 @@ func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, spl switch c { + case 1: + 
msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -327,9 +384,8 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG1Affine(chunk uint64, +func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, - buckets []g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element) { @@ -337,6 +393,7 @@ func msmProcessChunkG1Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -391,168 +448,36 @@ func msmProcessChunkG1Affine(chunk uint64, } -func (p *G1Jac) msmC4(points []G1Affine, scalars []fr.Element, 
splitFirstChunk bool) *G1Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC5(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // c doesn't divide 384, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go func(j uint64, points []G1Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g1JacExtended - msmProcessChunkG1Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC8(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const 
( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, c, chChunks[:]) -} - -func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. 
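+			// Editorial note: nbChunks above is effectively
+			// ceil(fr.Limbs*64 / c): the integer division is incremented once
+			// when c does not divide the scalar bit size, and that extra,
+			// smaller window is dispatched here before nbChunks is decremented
+			// for the main loop.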
+ msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]g1JacExtended - msmProcessChunkG1Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -575,7 +500,7 @@ func (p *G1Jac) msmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG1Affine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -637,7 +562,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -710,17 +635,74 @@ func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, spl switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.msmC16(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC6](p, 18, 
points, scalars, splitFirstChunk) + + case 19: + msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") @@ -743,9 +725,8 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine(chunk uint64, +func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, - buckets []g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element) { @@ -753,6 +734,7 @@ func msmProcessChunkG2Affine(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -807,168 +789,36 @@ func msmProcessChunkG2Affine(chunk uint64, } -func (p *G2Jac) msmC4(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 4 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC5(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 5 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks + 1]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // c doesn't divide 384, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - go 
func(j uint64, points []G2Affine, scalars []fr.Element) { - var buckets [1 << (lastC - 1)]g2JacExtended - msmProcessChunkG2Affine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() +func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC8(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 8 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, c, chChunks[:]) -} - -func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. 
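// Illustrative sketch, not part of the patch: why the bucket storage is a type
// parameter rather than a slice. Constraining B to a union of fixed-size array
// types lets `var buckets B` declare an array whose length is fixed at compile
// time for each instantiation, so (for moderate c) it can stay on the stack
// instead of going through make([]T, 1<<(c-1)). Toy model of the pattern, all
// names hypothetical:
type pt struct{ x, y uint64 }

type bucketsC4 [1 << 3]pt
type bucketsC5 [1 << 4]pt

type ibuckets interface{ bucketsC4 | bucketsC5 }

func accumulate[B ibuckets](ids []int) B {
	var buckets B // fixed-size array: no heap allocation required
	for _, id := range ids {
		buckets[id].x++ // stand-in for "add point into bucket id"
	}
	return buckets
}

// e.g. accumulate[bucketsC5]([]int{3, 3, 7}) selects the 16-bucket variant.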
+ go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- } processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]g2JacExtended - msmProcessChunkG2Affine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -991,5 +841,5 @@ func (p *G2Jac) msmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk }() } - return msmReduceChunkG2Affine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 09004ae309..72d199f31f 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -93,7 +93,7 @@ func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -166,69 +166,111 @@ func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.E switch c { + case 1: + msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - 
p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) + + case 18: + batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG1AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1AffineBatchAffine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG1Affine struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G1Affine +type BatchG1Affine[B ibG1Affine] struct { + P [MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G1Affine + buckets *B } -func newBatchG1Affine(buckets, points []G1Affine) BatchG1Affine { - batchSize := len(buckets) / 5 +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG1Affine) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG1Affine) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -243,45 +285,45 @@ func (b *BatchG1Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG1Affine) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG1Affine) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if 
BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -290,7 +332,7 @@ func (b *BatchG1Affine) Add(op batchOp) { b.cptP++ } -func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -305,16 +347,15 @@ func processQueueG1Affine(queue []batchOp, batch *BatchG1Affine) []batchOp { } -func msmProcessChunkG1AffineBatchAffine(chunk uint64, +func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, - buckets []G1Affine, c uint64, points []G1Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -331,7 +372,7 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(buckets, points) + batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -400,11 +441,12 @@ func msmProcessChunkG1AffineBatchAffine(chunk uint64, } -func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -412,14 +454,25 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g1JacExtended + chChunks := make([]chan g1JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g1JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G1Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g1JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
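// Illustrative sketch, not part of the patch: the scheduling invariant that
// the bucketIds set above enforces. All additions in one batch share a single
// inversion computed from every input up front, so a bucket may appear at most
// once per batch; a conflicting op is parked in the queue and retried after
// the batch has been flushed. Hypothetical names, batchSize >= 1 assumed,
// flush standing in for ExecuteAndReset:
func schedule(ops []uint32, batchSize int, flush func([]uint32)) {
	inBatch := make(map[uint32]struct{})
	var batch []uint32
	flushBatch := func() {
		if len(batch) == 0 {
			return
		}
		flush(batch)
		batch = batch[:0]
		inBatch = make(map[uint32]struct{})
	}
	queue := append([]uint32(nil), ops...)
	for len(queue) > 0 {
		var retry []uint32
		for _, bucketID := range queue {
			if _, busy := inBatch[bucketID]; busy {
				retry = append(retry, bucketID) // conflict: defer to a later batch
				continue
			}
			inBatch[bucketID] = struct{}{}
			batch = append(batch, bucketID)
			if len(batch) == batchSize {
				flushBatch()
			}
		}
		flushBatch() // guarantee progress before retrying deferred ops
		queue = retry
	}
}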
+ msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - var buckets [1 << (c - 1)]G1Affine - msmProcessChunkG1AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -442,7 +495,106 @@ func (p *G1Jac) batchAffineMsmC16(points []G1Affine, scalars []fr.Element, split }() } - return msmReduceChunkG1AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) +} + +type bucketG1AffineC1 [1 << (1 - 1)]G1Affine +type bucketG1AffineC2 [1 << (2 - 1)]G1Affine +type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC6 [1 << (6 - 1)]G1Affine +type bucketG1AffineC7 [1 << (7 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC9 [1 << (9 - 1)]G1Affine +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine +type bucketG1AffineC13 [1 << (13 - 1)]G1Affine +type bucketG1AffineC14 [1 << (14 - 1)]G1Affine +type bucketG1AffineC15 [1 << (15 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC17 [1 << (17 - 1)]G1Affine +type bucketG1AffineC18 [1 << (18 - 1)]G1Affine +type bucketG1AffineC19 [1 << (19 - 1)]G1Affine +type bucketG1AffineC20 [1 << (20 - 1)]G1Affine +type bucketG1AffineC21 [1 << (21 - 1)]G1Affine +type bucketG1AffineC22 [1 << (22 - 1)]G1Affine +type bucketG1AffineC23 [1 << (23 - 1)]G1Affine +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended +type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended +type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended +type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended + +type ibG1Affine interface { + bucketG1AffineC1 | + bucketG1AffineC2 | + bucketG1AffineC3 | + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC6 | + bucketG1AffineC7 | + bucketG1AffineC8 | + bucketG1AffineC9 | + bucketG1AffineC10 | + bucketG1AffineC11 | + bucketG1AffineC12 | + bucketG1AffineC13 | + bucketG1AffineC14 | + bucketG1AffineC15 | + bucketG1AffineC16 | + bucketG1AffineC17 | + bucketG1AffineC18 | + bucketG1AffineC19 | + 
bucketG1AffineC20 | + bucketG1AffineC21 | + bucketG1AffineC22 | + bucketG1AffineC23 +} + +type ibg1JacExtended interface { + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC6 | + bucketg1JacExtendedC7 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC9 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC13 | + bucketg1JacExtendedC14 | + bucketg1JacExtendedC15 | + bucketg1JacExtendedC16 | + bucketg1JacExtendedC17 | + bucketg1JacExtendedC18 | + bucketg1JacExtendedC19 | + bucketg1JacExtendedC20 | + bucketg1JacExtendedC21 | + bucketg1JacExtendedC22 | + bucketg1JacExtendedC23 } // MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf @@ -504,7 +656,7 @@ func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, con // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -577,69 +729,111 @@ func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.E switch c { + case 1: + msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) + + case 2: + msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) + + case 3: + msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) + case 4: - p.msmC4(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) case 5: - p.msmC5(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) + + case 6: + msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) + + case 7: + msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) case 8: - p.msmC8(points, scalars, splitFirstChunk) + msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) + + case 9: + msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) + + case 10: + batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) + + case 11: + batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) + + case 12: + batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) + + case 13: + batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) + + case 14: + batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) + + case 15: + batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) case 16: - p.batchAffineMsmC16(points, scalars, splitFirstChunk) + batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + + case 17: + batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC10](p, 17, points, scalars, 
splitFirstChunk) + + case 18: + batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + + case 19: + batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + + case 20: + batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + + case 21: + batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + + case 22: + batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + + case 23: + batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) default: panic("not implemented") } } -// msmReduceChunkG2AffineBatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG2AffineBatchAffine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { - var _p g2JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type BatchG2Affine struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - buckets, points []G2Affine +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -func newBatchG2Affine(buckets, points []G2Affine) BatchG2Affine { - batchSize := len(buckets) / 5 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine{ + return BatchG2Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *BatchG2Affine) IsFull() bool { +func (b *BatchG2Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine) ExecuteAndReset() { +func (b *BatchG2Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -654,45 +848,45 @@ func (b *BatchG2Affine) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine) CanAdd(bID uint32) bool { +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine) Add(op batchOp) { +func (b *BatchG2Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -701,7 +895,7 @@ func (b 
*BatchG2Affine) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -716,16 +910,15 @@ func processQueueG2Affine(queue []batchOp, batch *BatchG2Affine) []batchOp { } -func msmProcessChunkG2AffineBatchAffine(chunk uint64, +func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, - buckets []G2Affine, c uint64, points []G2Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -742,7 +935,7 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG2Affine(buckets, points) + batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -811,11 +1004,12 @@ func msmProcessChunkG2AffineBatchAffine(chunk uint64, } -func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - const ( - c = 16 // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -823,14 +1017,25 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks]chan g2JacExtended + chChunks := make([]chan g2JacExtended, nbChunks) for i := 0; i < len(chChunks); i++ { chChunks[i] = make(chan g2JacExtended, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []G2Affine, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]g2JacExtended, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
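// Illustrative sketch, not part of the patch: the degenerate pairs that the
// Add methods above filter before queueing, modeled on toy integer
// coordinates. The batched affine formulas need finite, non-opposite inputs:
// the identity is handled by a copy or a no-op, and bucket == -P collapses the
// bucket back to the identity; everything else (including equal points) is
// left to the batched addition itself. Hypothetical names:
type affine struct {
	x, y int
	inf  bool
}

func addIntoBucket(bk *affine, p affine, neg bool) bool {
	if neg {
		p.y = -p.y // cheap negation, mirrors the isNeg() handling above
	}
	switch {
	case p.inf: // adding the identity: nothing to schedule
		return false
	case bk.inf: // empty bucket: a plain copy, no inversion needed
		*bk = p
		return false
	case bk.x == p.x && bk.y == -p.y: // bk == -p: result is the identity
		*bk = affine{inf: true}
		return false
	}
	return true // safe to enqueue for the batched addition
}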
+ msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks-1), points, scalars) + nbChunks-- + } + processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - var buckets [1 << (c - 1)]G2Affine - msmProcessChunkG2AffineBatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j > 0; j-- { @@ -853,5 +1058,104 @@ func (p *G2Jac) batchAffineMsmC16(points []G2Affine, scalars []fr.Element, split }() } - return msmReduceChunkG2AffineBatchAffine(p, c, chChunks[:]) + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) +} + +type bucketG2AffineC1 [1 << (1 - 1)]G2Affine +type bucketG2AffineC2 [1 << (2 - 1)]G2Affine +type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC6 [1 << (6 - 1)]G2Affine +type bucketG2AffineC7 [1 << (7 - 1)]G2Affine +type bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC9 [1 << (9 - 1)]G2Affine +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine +type bucketG2AffineC13 [1 << (13 - 1)]G2Affine +type bucketG2AffineC14 [1 << (14 - 1)]G2Affine +type bucketG2AffineC15 [1 << (15 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC17 [1 << (17 - 1)]G2Affine +type bucketG2AffineC18 [1 << (18 - 1)]G2Affine +type bucketG2AffineC19 [1 << (19 - 1)]G2Affine +type bucketG2AffineC20 [1 << (20 - 1)]G2Affine +type bucketG2AffineC21 [1 << (21 - 1)]G2Affine +type bucketG2AffineC22 [1 << (22 - 1)]G2Affine +type bucketG2AffineC23 [1 << (23 - 1)]G2Affine +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended +type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended +type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended +type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended + +type ibG2Affine interface { + bucketG2AffineC1 | + bucketG2AffineC2 | + bucketG2AffineC3 | + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC6 | + bucketG2AffineC7 | + bucketG2AffineC8 | + bucketG2AffineC9 | + bucketG2AffineC10 | + bucketG2AffineC11 | + bucketG2AffineC12 | + bucketG2AffineC13 | + bucketG2AffineC14 | + bucketG2AffineC15 | + bucketG2AffineC16 | + bucketG2AffineC17 | + bucketG2AffineC18 | + bucketG2AffineC19 | + 
bucketG2AffineC20 | + bucketG2AffineC21 | + bucketG2AffineC22 | + bucketG2AffineC23 +} + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC17 | + bucketg2JacExtendedC18 | + bucketg2JacExtendedC19 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 | + bucketg2JacExtendedC22 | + bucketg2JacExtendedC23 } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index d5b1288c1e..8d851d2d42 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index bc367c5c14..6af6b7d54a 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -3,6 +3,7 @@ package ecc import ( "fmt" "path/filepath" + "reflect" "strings" "text/template" @@ -21,7 +22,36 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er {File: filepath.Join(baseDir, "marshal_test.go"), Templates: []string{"tests/marshal.go.tmpl"}}, } conf.Package = packageName - if err := bgen.Generate(conf, packageName, "./ecc/template", entries...); err != nil { + funcs := make(template.FuncMap) + funcs["last"] = func(x int, a interface{}) bool { + return x == reflect.ValueOf(a).Len()-1 + } + funcs["lastC"] = func(c int) int { + // lastC := (fr.Limbs * 
64) - (c * (fr.Limbs * 64 / c)) + // if c divides fr.Limbs * 64; + n := (conf.Fr.NbWords * 64) + if n%c == 0 { + return c + } + return n - (c * (n / c)) + } + funcs["contains"] = func(v int, s []int) bool { + for _, sv := range s { + if v == sv { + return true + } + } + return false + } + // TODO @gbotrel fix me. need to generate usual C, and missing lastC for bucket size. + conf.G1.CRange = make([]int, 23) + conf.G2.CRange = make([]int, 23) + for i := 0; i < len(conf.G1.CRange); i++ { + conf.G1.CRange[i] = i + 1 + conf.G2.CRange[i] = i + 1 + } + bavardOpts := []func(*bavard.Bavard) error{bavard.Funcs(funcs)} + if err := bgen.GenerateWithOptions(conf, packageName, "./ecc/template", bavardOpts, entries...); err != nil { return err } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index e476f889b7..62b5f03f62 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -299,7 +299,7 @@ func msmInner{{ $.TJacobian }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffin switch c { {{range $c := $.CRange}} case {{$c}}: - p.msmC{{$c}}(points, scalars, splitFirstChunk) + msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) {{end}} default: panic("not implemented") @@ -323,9 +323,8 @@ func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan } -func msmProcessChunk{{ $.TAffine }}(chunk uint64, +func msmProcessChunk{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}](chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, - buckets []{{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element) { @@ -334,6 +333,7 @@ func msmProcessChunk{{ $.TAffine }}(chunk uint64, mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c -1)) + var buckets B for i := 0 ; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -391,44 +391,41 @@ func msmProcessChunk{{ $.TAffine }}(chunk uint64, } -{{range $c := $.CRange}} - -{{- $frBits := mul $.FrNbWords 64}} -{{- $cDividesBits := divides $c $frBits}} -{{- $nbChunks := div $frBits $c}} - -func (p *{{ $.TJacobian }}) msmC{{$c}}(points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { - const ( - c = {{$c}} // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func msmC{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}, LB ib{{ $.TJacobianExtended }}](p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks{{if not $cDividesBits }} + 1 {{end}} ]chan {{ $.TJacobianExtended }} + chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks) for i:=0; i < len(chChunks);i++ { chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1) } - {{ if not $cDividesBits }} - // c doesn't divide {{$frBits}}, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + + if 
(fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - var buckets [1<<(lastC-1)]{{ $.TJacobianExtended }} - msmProcessChunk{{ $.TAffine }}(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]{{ $.TJacobianExtended }}, 1<<(lastC-1)) + // TODO @gbotrel last C restore. + msmProcessChunk{{ $.TAffine }}[LB](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks - 1), points, scalars) + nbChunks-- + } - {{- end}} processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) { - var buckets [1<<(c-1)]{{ $.TJacobianExtended }} - msmProcessChunk{{ $.TAffine }}(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunk{{ $.TAffine }}[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j >0; j-- { @@ -452,9 +449,7 @@ func (p *{{ $.TJacobian }}) msmC{{$c}}(points []{{ $.TAffine }}, scalars []fr.El } - return msmReduceChunk{{ $.TAffine }}(p, c, chChunks[:]) + return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) } -{{end}} - {{end }} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 02d8c72588..3a803280f3 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -27,7 +27,8 @@ func (o batchOp) isNeg() bool { {{ template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} + {{define "multiexp" }} @@ -169,9 +170,9 @@ func msmInner{{ $.TJacobian }}BatchAffine(p *{{ $.TJacobian }}, c int, points [] {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} - p.msmC{{$c}}(points, scalars, splitFirstChunk) + msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) {{- else}} - p.batchAffineMsmC{{$c}}(points, scalars, splitFirstChunk) + batch{{ $.TAffine }}Msm[bucket{{ $.TAffine }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) {{- end}} {{end}} default: @@ -179,53 +180,38 @@ func msmInner{{ $.TJacobian }}BatchAffine(p *{{ $.TJacobian }}, c int, points [] } } -// msmReduceChunk{{ $.TAffine }}BatchAffine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunk{{ $.TAffine }}BatchAffine(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { - var _p {{ $.TJacobianExtended }} - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -type Batch{{ $.TAffine }} struct { +type Batch{{ $.TAffine }}[B ib{{ $.TAffine }}] struct { P [MAX_BATCH_SIZE]{{ $.TAffine 
}} R [MAX_BATCH_SIZE]*{{ $.TAffine }} batchSize int cptP int bucketIds map[uint32]struct{} - buckets, points []{{ $.TAffine }} + points []{{ $.TAffine }} + buckets *B } -func newBatch{{ $.TAffine }}(buckets, points []{{ $.TAffine }}) Batch{{ $.TAffine }} { - batchSize := len(buckets) / 5 +func newBatch{{ $.TAffine }}[B ib{{ $.TAffine }}](buckets *B, points []{{ $.TAffine }}) Batch{{ $.TAffine }}[B] { + batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return Batch{{ $.TAffine }}{ + return Batch{{ $.TAffine }}[B]{ buckets: buckets, points: points, batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(buckets)/2), + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), } } -func (b *Batch{{ $.TAffine }}) IsFull() bool { +func (b *Batch{{ $.TAffine }}[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *Batch{{ $.TAffine }}) ExecuteAndReset() { +func (b *Batch{{ $.TAffine }}[B]) ExecuteAndReset() { if b.cptP == 0 { return } @@ -240,45 +226,45 @@ func (b *Batch{{ $.TAffine }}) ExecuteAndReset() { b.cptP = 0 } -func (b *Batch{{ $.TAffine }}) CanAdd(bID uint32) bool { +func (b *Batch{{ $.TAffine }}[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *Batch{{ $.TAffine }}) Add(op batchOp) { +func (b *Batch{{ $.TAffine }}[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch - B := &b.buckets[op.bucketID] + BK := &(*b.buckets)[op.bucketID] P := &b.points[op.pointID>>1] if P.IsInfinity() { return } // handle special cases with inf or -P / P - if B.IsInfinity() { + if BK.IsInfinity() { if op.isNeg() { - B.Neg(P) + BK.Neg(P) } else { - B.Set(P) + BK.Set(P) } return } if op.isNeg() { // if bucket == P --> -P == 0 - if B.Equal(P) { - B.setInfinity() + if BK.Equal(P) { + BK.setInfinity() return } } else { // if bucket == -P, B == 0 - if B.X.Equal(&P.X) && !B.Y.Equal(&P.Y) { - B.setInfinity() + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() return } } // b.bucketIds[b.cptP] = op.bucketID b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = B + b.R[b.cptP] = BK if op.isNeg() { b.P[b.cptP].Neg(P) } else { @@ -287,7 +273,7 @@ func (b *Batch{{ $.TAffine }}) Add(op batchOp) { b.cptP++ } -func processQueue{{ $.TAffine }}(queue []batchOp, batch *Batch{{ $.TAffine }}) []batchOp { +func processQueue{{ $.TAffine }}[B ib{{ $.TAffine }}](queue []batchOp, batch *Batch{{ $.TAffine }}[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -302,16 +288,15 @@ func processQueue{{ $.TAffine }}(queue []batchOp, batch *Batch{{ $.TAffine }}) [ } -func msmProcessChunk{{ $.TAffine }}BatchAffine(chunk uint64, +func msmProcessChunk{{ $.TAffine }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, - buckets []{{ $.TAffine }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 msbWindow := uint64(1 << (c - 1)) - + var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } @@ -328,7 +313,7 @@ func msmProcessChunk{{ $.TAffine }}BatchAffine(chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatch{{ $.TAffine }}(buckets, points) + batch := newBatch{{ $.TAffine }}(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
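// Illustrative sketch, not part of the patch: the bit selection the selector
// above encodes. Window number `chunk` starts at bit jc = chunk*c; when
// jc%64 > 64-c the window straddles a limb boundary, so its low bits come from
// limb jc/64 and the remaining nbBitsHigh bits from the next limb, recombined
// exactly as maskHigh/shiftHigh do. Hypothetical helper:
func windowAt(limbs []uint64, chunk, c uint64) uint64 {
	jc := chunk * c
	index, shift := jc/64, jc%64
	bits := (limbs[index] >> shift) & ((1 << c) - 1)
	if shift > 64-c && index < uint64(len(limbs))-1 {
		nbBitsHigh := shift - (64 - c) // bits that live in the next limb
		maskHigh := uint64(1)<<nbBitsHigh - 1
		bits |= (limbs[index+1] & maskHigh) << (c - nbBitsHigh)
	}
	return bits
}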
nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -398,17 +383,13 @@ func msmProcessChunk{{ $.TAffine }}BatchAffine(chunk uint64, } -{{range $c := $.CRange}} -{{- if gt $c 9}} -{{- $frBits := mul $.FrNbWords 64}} -{{- $cDividesBits := divides $c $frBits}} -{{- $nbChunks := div $frBits $c}} -func (p *{{ $.TJacobian }}) batchAffineMsmC{{$c}}(points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { - const ( - c = {{$c}} // scalars partitioned into c-bit radixes - nbChunks = (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - ) +func batch{{ $.TAffine }}Msm[B ib{{ $.TAffine }}, J ib{{ $.TJacobianExtended }}](p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { + + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -416,33 +397,26 @@ func (p *{{ $.TJacobian }}) batchAffineMsmC{{$c}}(points []{{ $.TAffine }}, scal // critical for performance // each go routine sends its result in chChunks[i] channel - var chChunks [nbChunks{{if not $cDividesBits }} + 1 {{end}} ]chan {{ $.TJacobianExtended }} + chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks) for i:=0; i < len(chChunks);i++ { chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1) } + if (fr.Limbs*64)%c != 0 { + // TODO @gbotrel not always needed to do ext jac here. + go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { + // var buckets LB + // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) + // buckets := make([]{{ $.TJacobianExtended }}, 1<<(lastC-1)) + // TODO @gbotrel lastC restore. 
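// Illustrative sketch, not part of the patch: what msmReduceChunk (called at
// the end of this generated function) computes. Chunk j carries weight
// 2^(c*j), and the reduction folds the per-chunk results Horner-style,
// doubling the running total c times before adding the next lower chunk.
// Integer model of the same fold:
func reduceChunks(chunks []uint64, c uint) uint64 {
	total := chunks[len(chunks)-1] // start from the most significant window
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := uint(0); l < c; l++ {
			total += total // in the group this step is a point doubling
		}
		total += chunks[j]
	}
	return total // equals the sum over j of chunks[j] << (c*j)
}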
+ msmProcessChunk{{ $.TAffine }}[J](j, chChunks[j], c, points, scalars) + }(uint64(nbChunks - 1), points, scalars) + nbChunks-- + } - {{ if not $cDividesBits }} - - // c doesn't divide {{$frBits}}, last window is smaller we can allocate less buckets - const lastC = (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // TODO @gbotrel replace this in code generator - if lastC >= 10 { - go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - var buckets [1<<(lastC-1)]{{ $.TAffine }} - msmProcessChunk{{ $.TAffine }}BatchAffine(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } else { - go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - var buckets [1<<(lastC-1)]{{ $.TJacobianExtended }} - msmProcessChunk{{ $.TAffine }}(j, chChunks[j], buckets[:], c, points, scalars) - }(uint64(nbChunks), points, scalars) - } - {{- end}} processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) { - var buckets [1<<(c-1)]{{ $.TAffine }} - msmProcessChunk{{ $.TAffine }}BatchAffine(uint64(j), chChunk, buckets[:], c, points, scalars) + msmProcessChunk{{ $.TAffine }}BatchAffine[B](uint64(j), chChunk, c, points, scalars) } for j := int(nbChunks - 1); j >0; j-- { @@ -465,10 +439,29 @@ func (p *{{ $.TJacobian }}) batchAffineMsmC{{$c}}(points []{{ $.TAffine }}, scal }() } - return msmReduceChunk{{ $.TAffine }}BatchAffine(p, c, chChunks[:]) + return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) } + + + + +{{- range $c := $.CRange}} +type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} +{{- end}} +{{- range $c := $.CRange}} +type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} {{- end}} -{{end}} +type ib{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + bucket{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} +} + +type ib{{ $.TJacobianExtended }} interface { + {{- range $i, $c := $.CRange}} + bucket{{ $.TJacobianExtended }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} +} {{end }} diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index cde8bd0b2a..93e26a09d6 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -91,7 +91,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - r16.msmC16(samplePoints[:], scalars16, true) + msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C16, bucket{{ $.TJacobianExtended }}C{{lastC 16}}](&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -287,7 +287,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var testPoint {{ $.TAffine }} - for i := 5; i <= pow; i++ { + for i := 11; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { From 5edbf300a5f06062a64953552e52dcc9071aa924 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 8 Nov 2022 14:36:18 -0600 Subject: [PATCH 04/43] feat,style: factorize code between extjac and affine msm using generics --- ecc/bls12-377/multiexp.go | 760 ++++++--------- ecc/bls12-377/multiexp_affine.go | 920 +++--------------- 
ecc/bls12-377/multiexp_jacobian.go | 229 +++++ ecc/bls12-377/multiexp_test.go | 34 +- ecc/bls12-378/multiexp.go | 760 ++++++--------- ecc/bls12-378/multiexp_affine.go | 920 +++--------------- ecc/bls12-378/multiexp_jacobian.go | 229 +++++ ecc/bls12-378/multiexp_test.go | 34 +- ecc/bls12-381/multiexp.go | 760 ++++++--------- ecc/bls12-381/multiexp_affine.go | 920 +++--------------- ecc/bls12-381/multiexp_jacobian.go | 229 +++++ ecc/bls12-381/multiexp_test.go | 34 +- ecc/bls24-315/multiexp.go | 760 ++++++--------- ecc/bls24-315/multiexp_affine.go | 920 +++--------------- ecc/bls24-315/multiexp_jacobian.go | 229 +++++ ecc/bls24-315/multiexp_test.go | 34 +- ecc/bls24-317/multiexp.go | 760 ++++++--------- ecc/bls24-317/multiexp_affine.go | 920 +++--------------- ecc/bls24-317/multiexp_jacobian.go | 229 +++++ ecc/bls24-317/multiexp_test.go | 34 +- ecc/bn254/multiexp.go | 760 ++++++--------- ecc/bn254/multiexp_affine.go | 920 +++--------------- ecc/bn254/multiexp_jacobian.go | 229 +++++ ecc/bn254/multiexp_test.go | 34 +- ecc/bw6-633/multiexp.go | 710 +++++--------- ecc/bw6-633/multiexp_affine.go | 920 +++--------------- ecc/bw6-633/multiexp_jacobian.go | 177 ++++ ecc/bw6-633/multiexp_test.go | 34 +- ecc/bw6-756/multiexp.go | 712 +++++--------- ecc/bw6-756/multiexp_affine.go | 920 +++--------------- ecc/bw6-756/multiexp_jacobian.go | 177 ++++ ecc/bw6-756/multiexp_test.go | 34 +- ecc/bw6-761/multiexp.go | 712 +++++--------- ecc/bw6-761/multiexp_affine.go | 920 +++--------------- ecc/bw6-761/multiexp_jacobian.go | 177 ++++ ecc/bw6-761/multiexp_test.go | 34 +- internal/generator/config/curve.go | 20 +- internal/generator/ecc/generate.go | 29 +- .../generator/ecc/template/multiexp.go.tmpl | 212 ++-- .../ecc/template/multiexp_affine.go.tmpl | 406 ++------ .../ecc/template/multiexp_jacobian.go.tmpl | 106 ++ .../ecc/template/tests/multiexp.go.tmpl | 16 +- 42 files changed, 6040 insertions(+), 11934 deletions(-) create mode 100644 ecc/bls12-377/multiexp_jacobian.go create mode 100644 ecc/bls12-378/multiexp_jacobian.go create mode 100644 ecc/bls12-381/multiexp_jacobian.go create mode 100644 ecc/bls24-315/multiexp_jacobian.go create mode 100644 ecc/bls24-317/multiexp_jacobian.go create mode 100644 ecc/bn254/multiexp_jacobian.go create mode 100644 ecc/bw6-633/multiexp_jacobian.go create mode 100644 ecc/bw6-756/multiexp_jacobian.go create mode 100644 ecc/bw6-761/multiexp_jacobian.go create mode 100644 internal/generator/ecc/template/multiexp_jacobian.go.tmpl diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 0f487a104c..1673861355 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. 
-// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
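To make the signed-digit recoding described above concrete: each c-bit window holding a digit >= 2^{c-1} is replaced by digit - 2^c (negative) plus a carry of 1 into the next window. Below is a minimal standalone Go sketch of that borrowing rule; `recode` is an illustrative helper, not a function in this patch, and it handles only single-word scalars without the msbWindow packing the real code uses:

```go
package main

import "fmt"

// recode splits a scalar into signed base-2^c digits in [-2^{c-1}, 2^{c-1}).
// A digit >= 2^{c-1} borrows 2^c from the next window, as partitionScalars does.
func recode(scalar uint64, c uint) []int {
	mask := uint64(1)<<c - 1
	max := 1 << (c - 1)
	var digits []int
	carry := 0
	for scalar != 0 || carry != 0 {
		d := int(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d >= max {
			d -= 1 << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	// 0xB5 = 181; with c = 4 the windows are 5 and 11.
	// 11 >= 8, so it is recoded as -5 with a carry:
	// 181 = 5 + 16*(-5) + 256*1
	fmt.Println(recode(0xB5, 4)) // [5 -5 1]
}
```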
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
+ go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
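This refactor only changes internals (chunk processors are now passed as function values); callers still go through MultiExp. A hedged caller-side sketch, assuming the public API (`Generators`, `ecc.MultiExpConfig`, `ScalarMultiplication`) is as elsewhere in the library:

```go
package main

import (
	"fmt"
	"math/big"

	"github.com/consensys/gnark-crypto/ecc"
	bls12377 "github.com/consensys/gnark-crypto/ecc/bls12-377"
	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
)

func main() {
	// toy instance: every point is the generator, scalars are 1..4,
	// so the MSM must equal (1+2+3+4)*g = 10*g
	const n = 4
	g1Jac, _, g1Aff, _ := bls12377.Generators()

	points := make([]bls12377.G1Affine, n)
	scalars := make([]fr.Element, n)
	for i := 0; i < n; i++ {
		points[i] = g1Aff
		scalars[i].SetUint64(uint64(i + 1))
	}

	var res, expected bls12377.G1Jac
	if _, err := res.MultiExp(points, scalars, ecc.MultiExpConfig{NbTasks: 2}); err != nil {
		panic(err)
	}
	expected.ScalarMultiplication(&g1Jac, big.NewInt(10))
	fmt.Println(res.Equal(&expected)) // true
}
```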
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}

-	case 21:
-		msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk)
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)

-	case 22:
-		msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+	}

-	case 23:
-		msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk)
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	default:
-		panic("not implemented")
	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
}

// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
	return p.unsafeFromJacExtended(&_p)
}

-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}

-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
		}
+		selectors[chunk] = d
	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 8ff827c422..ac8d41cbb6 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -17,11 +17,7 @@ package bls12377 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
- msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG1AffineC4 [1 << (4 - 1)]G1Affine type bucketG1AffineC5 [1 << (5 - 1)]G1Affine type bucketG1AffineC6 [1 << (6 - 1)]G1Affine @@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine type bucketG1AffineC20 [1 << (20 - 1)]G1Affine type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | + bucketG1AffineC4 | bucketG1AffineC5 | bucketG1AffineC6 | bucketG1AffineC7 | @@ -562,258 +162,21 @@ type ibG1Affine interface { bucketG1AffineC14 | bucketG1AffineC15 | bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - 
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine batchSize int cptP int bucketIds map[uint32]struct{} - points []G2Affine + points []G1Affine buckets *B } -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine[B]{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, @@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine } } -func (b *BatchG2Affine[B]) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine[B]) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } // for i := 0; i < len(b.R); i++ { // b.R[i].Add(b.R[i], b.P[i]) // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) for k := range b.bucketIds { delete(b.bucketIds, k) } @@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine[B]) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &(*b.buckets)[op.bucketID] @@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] } -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
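For intuition: an affine addition R = R + P costs one field inversion, in the denominator of the slope lambda = (y_P - y_R)/(x_P - x_R), and the denominators of a whole batch of independent additions can share a single inversion via Montgomery's trick. A standalone sketch of that trick, not part of the patch, using math/big over a toy field F_101 in place of field elements:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert inverts all elements of a mod p with one ModInverse call:
// a forward pass builds prefix products, the product is inverted once,
// and a backward pass peels off each individual inverse.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a { // res[i] = a[0]*...*a[i-1]
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the only inversion
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc) // prefix * (a[i]*...*a[n-1])^-1 = a[i]^-1
		res[i].Mod(res[i], p)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	in := []*big.Int{big.NewInt(2), big.NewInt(3), big.NewInt(7)}
	for _, inv := range batchInvert(in, p) {
		fmt.Println(inv) // 51, 34, 29: the inverses of 2, 3, 7 mod 101
	}
}

With MAX_BATCH_SIZE = 600, one inversion plus roughly three extra multiplications per element replaces 600 inversions, which is why affine buckets become competitive with extended Jacobian ones once c is large enough.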
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, } -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG2AffineC4 [1 << (4 - 1)]G2Affine type bucketG2AffineC5 [1 << (5 - 1)]G2Affine type bucketG2AffineC6 [1 << (6 - 1)]G2Affine @@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine type bucketG2AffineC20 [1 << (20 - 1)]G2Affine type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended 
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0
+		if BK.Equal(P) {
+			BK.setInfinity()
+			return
+		}
+	} else {
+		// if bucket == -P, B == 0
+		if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) {
+			BK.setInfinity()
+			return
+		}
+	}
+
+	// b.bucketIds[b.cptP] = op.bucketID
+	b.bucketIds[op.bucketID] = struct{}{}
+	b.R[b.cptP] = BK
+	if op.isNeg() {
+		b.P[b.cptP].Neg(P)
+	} else {
+		b.P[b.cptP].Set(P)
+	}
+	b.cptP++
+}
+
+func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+	for i := len(queue) - 1; i >= 0; i-- {
+		if batch.CanAdd(queue[i].bucketID) {
+			batch.Add(queue[i])
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+			}
+			queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue
+
+}
diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go
new file mode 100644
index 0000000000..fc89ebd2cc
--- /dev/null
+++ b/ecc/bls12-377/multiexp_jacobian.go
@@ -0,0 +1,229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls12377
+
+import (
+	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+	chRes chan<- g1JacExtended,
+	c uint64,
+	points []G1Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 +} diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index a14b7946f2..7882874fda 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index ebc19dc090..862cca829b 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is 
not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
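To get a feel for the window-size heuristic above, the cost model cost(c) = (bits/c) * (nbPoints + 2^c) can be evaluated directly. A standalone sketch, not part of the patch, for a 256-bit scalar field and the window sizes this patch keeps:

package main

import (
	"fmt"
	"math"
)

// bestC returns the window size with the smallest modeled cost,
// cost(c) = (bits/c) * (nbPoints + 2^c), mirroring the heuristic above.
func bestC(nbPoints, bits int, implementedCs []uint64) uint64 {
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(bits) * float64(nbPoints+(1<<c)) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	cs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	for _, n := range []int{1 << 10, 1 << 16, 1 << 20, 1 << 24} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n, 256, cs))
	}
	// under this model: c=8 for 2^10, c=13 for 2^16, c=16 for 2^20, c=20 for 2^24
}

As the deleted comment notes, this model is only approximate; cache behavior and the size of a group element also matter in practice.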
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
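Concretely, the last chunk is smaller whenever c does not divide the 256-bit scalar size: the top window then carries only 256 mod c bits. A standalone sketch, not part of the patch, that reproduces the processChunk/processLastChunk pairings visible in the innerMsmG1 switch above (c=13 leaves a 9-bit last chunk, c=15 a 1-bit one, c=16 divides evenly):

package main

import "fmt"

func main() {
	const bits = 4 * 64 // fr.Limbs * 64 on a 4-limb scalar field
	for _, c := range []int{13, 14, 15, 16} {
		nbChunks := bits / c
		lastC := bits - c*nbChunks // bits left over for the extra top chunk
		if lastC == 0 {
			lastC = c // c divides 256: every chunk is full width
		} else {
			nbChunks++
		}
		fmt.Printf("c=%d: %d chunks, last chunk carries %d bits\n", c, nbChunks, lastC)
	}
}

A lastC-bit window only needs 2^{lastC-1} buckets, which is why, for example, case 13 pairs its batch-affine chunks with a bucketg1JacExtendedC9 last chunk.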
+ go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
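The chunk reduction in msmReduceChunkG1Affine above is Horner's rule in the exponent: with per-chunk results t_0 ... t_{n-1} (least-significant window first), the MSM result is t_{n-1}*2^{c(n-1)} + ... + t_1*2^c + t_0, evaluated from the top down with c doublings per step. A toy integer model, not part of the patch:

package main

import "fmt"

// reduce mirrors msmReduceChunkG1Affine with ints standing in for points:
// start from the most significant chunk, then repeatedly double c times
// and add the next chunk down.
func reduce(chunks []int, c int) int {
	p := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			p += p // stand-in for _p.double(&_p)
		}
		p += chunks[j] // stand-in for _p.add(&totalj)
	}
	return p
}

func main() {
	// chunk digits of 0x321 in base 2^4, least significant first
	fmt.Println(reduce([]int{1, 2, 3}, 4)) // prints 801 = 0x321
}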
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - default: - panic("not implemented") } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp @@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. 
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1)) // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 4becea22f3..583761fe76 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -17,11 +17,7 @@ package bls12378 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
- msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG1AffineC4 [1 << (4 - 1)]G1Affine type bucketG1AffineC5 [1 << (5 - 1)]G1Affine type bucketG1AffineC6 [1 << (6 - 1)]G1Affine @@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine type bucketG1AffineC20 [1 << (20 - 1)]G1Affine type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | + bucketG1AffineC4 | bucketG1AffineC5 | bucketG1AffineC6 | bucketG1AffineC7 | @@ -562,258 +162,21 @@ type ibG1Affine interface { bucketG1AffineC14 | bucketG1AffineC15 | bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - 
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine batchSize int cptP int bucketIds map[uint32]struct{} - points []G2Affine + points []G1Affine buckets *B } -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine[B]{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, @@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine } } -func (b *BatchG2Affine[B]) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine[B]) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } // for i := 0; i < len(b.R); i++ { // b.R[i].Add(b.R[i], b.P[i]) // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) for k := range b.bucketIds { delete(b.bucketIds, k) } @@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine[B]) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &(*b.buckets)[op.bucketID] @@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] } -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
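+// (As a sketch of where the saving comes from, using the standard affine chord
+// formulas rather than anything specific to this file: adding P to a bucket R
+// computes lambda = (Py-Ry)/(Px-Rx), Rx' = lambda^2 - Rx - Px and
+// Ry' = lambda*(Rx-Rx') - Ry, i.e. one field inversion per addition;
+// Montgomery's trick inverts k such denominators with a single inversion plus
+// about 3(k-1) multiplications, amortizing the inversion across the batch.)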
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, } -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG2AffineC4 [1 << (4 - 1)]G2Affine type bucketG2AffineC5 [1 << (5 - 1)]G2Affine type bucketG2AffineC6 [1 << (6 - 1)]G2Affine @@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine type bucketG2AffineC20 [1 << (20 - 1)]G2Affine type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended 
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + } diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go new file mode 100644 index 0000000000..a26fe93845 --- /dev/null +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -0,0 +1,229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls12378 + +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" +) + +func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended +type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended +type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended +type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended +type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended + +type ibg1JacExtended interface { + bucketg1JacExtendedC1 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC6 | + bucketg1JacExtendedC7 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC9 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC13 | + bucketg1JacExtendedC14 | + bucketg1JacExtendedC15 | + bucketg1JacExtendedC16 | + bucketg1JacExtendedC20 | + bucketg1JacExtendedC21 +} + +func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 +} diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index c4acf67088..8a80c9d1f8 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index a66bb3aa70..d926dc8e2e 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is 
not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
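For intuition on the bestC heuristic kept above: it scores each window size c by the approximate group-operation count bits/c * (nbPoints + 2^c) and keeps the minimizer. The following standalone sketch mirrors that selection; the name bestWindowSize is hypothetical, and the 256-bit scalar width (fr.Limbs = 4) is an assumption for illustration, not something this patch pins down:

	package main

	import (
		"fmt"
		"math"
	)

	// bestWindowSize returns the implemented window size c minimizing the
	// approximate MSM cost bits/c * (nbPoints + 2^c).
	func bestWindowSize(nbPoints int, implementedCs []uint64) uint64 {
		const scalarBits = 256 // assumption: a 4x64-bit scalar field
		bestC := implementedCs[0]
		bestCost := math.MaxFloat64
		for _, c := range implementedCs {
			// nbPoints bucket updates per chunk, plus ~2^c additions to fold
			// the buckets, over roughly scalarBits/c chunks.
			cost := float64(scalarBits) / float64(c) * float64(nbPoints+(1<<c))
			if cost < bestCost {
				bestCost = cost
				bestC = c
			}
		}
		return bestC
	}

	func main() {
		cs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
		fmt.Println(bestWindowSize(1<<20, cs)) // prints 16
	}

With a million points this picks c = 16: relative to c = 8 it halves the number of chunks, while the extra 2^16 bucket-fold additions stay small next to the 2^20 bucket insertions per chunk.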
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
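+	// (editor's illustration, not part of the original patch: for a 4-limb scalar field,
+	// fr.Limbs*64 = 256 bits, so with c = 10 a scalar splits into 25 full 10-bit windows
+	// plus a 6-bit remainder; that remainder is why the c = 10 case above pairs the
+	// batch-affine processChunk with processChunkG1Jacobian[bucketg1JacExtendedC6] as
+	// processLastChunk.)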
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)

-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
 	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g1JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
+	}

 	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }

+// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
+	var _p g1JacExtended
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.double(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.add(&totalj)
+	}
+
+	return p.unsafeFromJacExtended(&_p)
+}
+
 // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
@@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 	// we split recursively until nbChunks(c) >= nbTasks,
 	bestC := func(nbPoints int) uint64 {
 		// implemented msmC methods (the c we use must be in this slice)
-		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
 		var C uint64
 		// approximate cost (in group operations)
 		// cost = bits/c * (nbPoints + 2^{c})
@@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
-	// we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time
+	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

 	// we have nbSplits intermediate results that we must sum together.
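Illustration (editor's addition, not part of the patch): msmReduceChunkG1Affine earlier in this hunk folds the per-window results Horner-style -- c doublings, then one add -- which equals the weighted sum sum_j 2^(c*j) * chunk_j. A minimal, self-contained Go sketch of that identity over the integers, with big.Int standing in for group elements and toy window sums:

package main

import (
	"fmt"
	"math/big"
)

func main() {
	const c = 4
	// per-window sums, least-significant window first (toy values)
	chunks := []*big.Int{big.NewInt(7), big.NewInt(3), big.NewInt(11)}

	// Horner-style fold, mirroring msmReduceChunkG1Affine: start from the
	// most-significant window, shift by c bits ("double" c times), then add.
	acc := new(big.Int).Set(chunks[len(chunks)-1])
	for j := len(chunks) - 2; j >= 0; j-- {
		acc.Lsh(acc, c)
		acc.Add(acc, chunks[j])
	}

	// direct weighted sum: sum_j chunks[j] * 2^(c*j)
	direct := new(big.Int)
	for j := range chunks {
		direct.Add(direct, new(big.Int).Lsh(chunks[j], uint(c*j)))
	}

	fmt.Println(acc, direct) // both print 2871
}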
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}

-	case 21:
-		msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk)
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)

-	case 22:
-		msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+	}

-	case 23:
-		msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk)
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	default:
-		panic("not implemented")
 	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }

 // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	return p.unsafeFromJacExtended(&_p)
 }

-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}

-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
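+	// (editor's illustration, not part of the original patch: typical small values are the
+	// 0/1 boolean wires of a SNARK witness, which satisfy scalar[0]&mask == scalar[0]; when
+	// they make up at least ~10% of the scalars, the MultiExp callers above set
+	// splitFirstChunk and process chunk 0 with two goroutines.)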
+	chSmallValues := make(chan int, nbTasks)

-	var runningSum, total g2JacExtended
-	runningSum.setInfinity()
-	total.setInfinity()
-	for k := len(buckets) - 1; k >= 0; k-- {
-		if !buckets[k].ZZ.IsZero() {
-			runningSum.add(&buckets[k])
-		}
-		total.add(&runningSum)
-	}
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int

-	chRes <- total
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}

-}
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]

-func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
-		nbChunks++
-	}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+				// init with carry if any
+				digit := carry
+				carry = 0

-	// each go routine sends its result in chChunks[i] channel
-	chChunks := make([]chan g2JacExtended, nbChunks)
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)

-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g2JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel last C restore.
-			msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}

-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars)
-	}
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+				// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
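+				// (editor's example, not part of the original patch: with c = 4, max = 2^3 = 8,
+				// a window value of 13 becomes 13 - 16 = -3 with carry = 1, since
+				// 13*2^k = -3*2^k + 2^(k+4).)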
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}

-	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
 }
diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go
index d9469a7fb1..36695009a0 100644
--- a/ecc/bls12-381/multiexp_affine.go
+++ b/ecc/bls12-381/multiexp_affine.go
@@ -17,11 +17,7 @@ package bls12381
 
 import (
-	"errors"
-	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bls12-381/fr"
-	"math"
-	"runtime"
 )
 
 const MAX_BATCH_SIZE = 600
@@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool {
 	return o.pointID&1 == 1
 }
 
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+// processChunkG1BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
 //
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
-	var _p G1Jac
-	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
-		return nil, err
-	}
-	p.FromJacobian(&_p)
-	return p, nil
-}
-
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
-	// note:
-	// each of the batchAffineMsmCX method is the same, except for the c constant it declares
-	// duplicating (through template generation) these methods allows to declare the buckets on the stack
-	// the choice of c needs to be improved:
-	// there is a theoritical value that gives optimal asymptotics
-	// but in practice, other factors come into play, including:
-	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
-	// * number of CPUs
-	// * cache friendliness (which depends on the host, G1 or G2... )
-	// --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
-
-	// for each batchAffineMsmCX
-	// step 1
-	// we compute, for each scalars over c-bit wide windows, nbChunk digits
-	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
-	// 2^{c} to the current digit, making it negative.
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
-			msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
-
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
-
-	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
-}
-
-type bucketG1AffineC1 [1 << (1 - 1)]G1Affine
-type bucketG1AffineC2 [1 << (2 - 1)]G1Affine
-type bucketG1AffineC3 [1 << (3 - 1)]G1Affine
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
 type bucketG1AffineC4 [1 << (4 - 1)]G1Affine
 type bucketG1AffineC5 [1 << (5 - 1)]G1Affine
 type bucketG1AffineC6 [1 << (6 - 1)]G1Affine
@@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine
 type bucketG1AffineC14 [1 << (14 - 1)]G1Affine
 type bucketG1AffineC15 [1 << (15 - 1)]G1Affine
 type bucketG1AffineC16 [1 << (16 - 1)]G1Affine
-type bucketG1AffineC17 [1 << (17 - 1)]G1Affine
-type bucketG1AffineC18 [1 << (18 - 1)]G1Affine
-type bucketG1AffineC19 [1 << (19 - 1)]G1Affine
 type bucketG1AffineC20 [1 << (20 - 1)]G1Affine
 type bucketG1AffineC21 [1 << (21 - 1)]G1Affine
-type bucketG1AffineC22 [1 << (22 - 1)]G1Affine
-type bucketG1AffineC23 [1 << (23 - 1)]G1Affine
-type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
-type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended
-type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
-type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
-type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
-type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
-type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
-type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
-type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
-type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
-type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
-type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
-type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
-type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
-type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
-type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
-type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended
-type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended
-type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended
-type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
-type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
-type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended
-type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended
 
 type ibG1Affine interface {
-	bucketG1AffineC1 |
-		bucketG1AffineC2 |
-		bucketG1AffineC3 |
-		bucketG1AffineC4 |
+	bucketG1AffineC4 |
 		bucketG1AffineC5 |
 		bucketG1AffineC6 |
 		bucketG1AffineC7 |
@@ -562,258 +162,21 @@
 		bucketG1AffineC14 |
 		bucketG1AffineC15 |
 		bucketG1AffineC16 |
-		bucketG1AffineC17 |
-		bucketG1AffineC18 |
-		bucketG1AffineC19 |
 		bucketG1AffineC20 |
-		bucketG1AffineC21 |
-		bucketG1AffineC22 |
-		
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine
+	R         [MAX_BATCH_SIZE]*G1Affine
 	batchSize int
 	cptP      int
 	bucketIds map[uint32]struct{}
-	points    []G2Affine
+	points    []G1Affine
 	buckets   *B
 }
 
-func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] {
+func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] {
 	batchSize := len(*buckets) / 5
 	if batchSize > MAX_BATCH_SIZE {
 		batchSize = MAX_BATCH_SIZE
@@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine
 	if batchSize <= 0 {
 		batchSize = 1
 	}
-	return BatchG2Affine[B]{
+	return BatchG1Affine[B]{
 		buckets:   buckets,
 		points:    points,
 		batchSize: batchSize,
@@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine
 	}
 }
 
-func (b *BatchG2Affine[B]) IsFull() bool {
+func (b *BatchG1Affine[B]) IsFull() bool {
 	return b.cptP == b.batchSize
 }
 
-func (b *BatchG2Affine[B]) ExecuteAndReset() {
+func (b *BatchG1Affine[B]) ExecuteAndReset() {
 	if b.cptP == 0 {
 		return
 	}
 	// for i := 0; i < len(b.R); i++ {
 	// 	b.R[i].Add(b.R[i], b.P[i])
 	// }
-	BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP)
+	BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP)
 	for k := range b.bucketIds {
 		delete(b.bucketIds, k)
 	}
@@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() {
 	b.cptP = 0
 }
 
-func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool {
+func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool {
 	_, ok := b.bucketIds[bID]
 	return !ok
 }
 
-func (b *BatchG2Affine[B]) Add(op batchOp) {
+func (b *BatchG1Affine[B]) Add(op batchOp) {
 	// CanAdd must be called before --> ensures bucket is not "used" in current batch
 
 	BK := &(*b.buckets)[op.bucketID]
@@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) {
 	b.cptP++
 }
 
-func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp {
 	for i := len(queue) - 1; i >= 0; i-- {
 		if batch.CanAdd(queue[i].bucketID) {
 			batch.Add(queue[i])
@@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]
 
 }
 
-func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+// processChunkG2BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
+//
+// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249
+// See Section 5.3: ia.cr/2022/1396
+func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
 	scalars []fr.Element) {
@@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
 
 }
 
-func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-
-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
-		nbChunks++
-	}
-
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	chChunks := make([]chan g2JacExtended, nbChunks)
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g2JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel lastC restore.
-			msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
-
-	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
-}
-
-type bucketG2AffineC1 [1 << (1 - 1)]G2Affine
-type bucketG2AffineC2 [1 << (2 - 1)]G2Affine
-type bucketG2AffineC3 [1 << (3 - 1)]G2Affine
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
 type bucketG2AffineC4 [1 << (4 - 1)]G2Affine
 type bucketG2AffineC5 [1 << (5 - 1)]G2Affine
 type bucketG2AffineC6 [1 << (6 - 1)]G2Affine
@@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine
 type bucketG2AffineC14 [1 << (14 - 1)]G2Affine
 type bucketG2AffineC15 [1 << (15 - 1)]G2Affine
 type bucketG2AffineC16 [1 << (16 - 1)]G2Affine
-type bucketG2AffineC17 [1 << (17 - 1)]G2Affine
-type bucketG2AffineC18 [1 << (18 - 1)]G2Affine
-type bucketG2AffineC19 [1 << (19 - 1)]G2Affine
 type bucketG2AffineC20 [1 << (20 - 1)]G2Affine
 type bucketG2AffineC21 [1 << (21 - 1)]G2Affine
-type bucketG2AffineC22 [1 << (22 - 1)]G2Affine
-type bucketG2AffineC23 [1 << (23 - 1)]G2Affine
-type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended
-type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended
-type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended
-type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended
-type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended
-type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended
-type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + } diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go new file mode 100644 index 0000000000..a4e61348b7 --- /dev/null +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -0,0 +1,229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls12381 + +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" +) + +func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 +} diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 4248afb29d..15cd0f5304 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index f97aa4e2f4..3686207518 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is 
not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
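
As a companion to the window-selection heuristic above, here is a minimal standalone sketch (illustration only, not part of this patch) of the cost function bestC minimizes; the 4-limb (256-bit) scalar-field width is an assumption standing in for fr.Limbs on this curve:

package main

import (
	"fmt"
	"math"
)

// bestC mirrors the heuristic used in MultiExp: pick the window size c that
// minimizes cost ~= (nbBits/c) * (nbPoints + 2^c) group operations, restricted
// to the window sizes that have generated code.
func bestC(nbPoints int) uint64 {
	const frLimbs = 4 // assumption: 4 x 64-bit limbs in the scalar field
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cc := frLimbs * 64 * (nbPoints + (1 << c))
		cost := float64(cc) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	// larger instances justify larger windows (more buckets, fewer chunks)
	for _, n := range []int{1 << 10, 1 << 16, 1 << 20} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n))
	}
}
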
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
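+	// (when c does not divide fr.Limbs*64, the top window carries only (fr.Limbs*64) mod c
+	// bits; assuming fr.Limbs = 4 here, c = 20 leaves 256 mod 20 = 16 bits, which is why
+	// the c = 20 case above pairs processChunk with a C16 last-chunk variant)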
+ go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
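
The reduction step is worth a worked example: msmReduceChunkG1Affine performs c doublings between chunk additions, i.e. a Horner evaluation of result = sum_j 2^(c*j) * chunk[j]. A minimal sketch checking that identity on plain integers (stand-ins for curve points; the values are arbitrary):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	const c = 4
	chunks := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(1)} // chunk[0], chunk[1], chunk[2]

	// Horner: start from the most significant chunk, double c times, add the next one.
	acc := new(big.Int).Set(chunks[len(chunks)-1])
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			acc.Add(acc, acc) // stands in for _p.double(&_p)
		}
		acc.Add(acc, chunks[j]) // stands in for _p.add(&totalj)
	}

	// direct evaluation of sum_j 2^(c*j) * chunk[j]
	direct := new(big.Int)
	for j, ch := range chunks {
		direct.Add(direct, new(big.Int).Lsh(ch, uint(c*j)))
	}
	fmt.Println(acc, direct, acc.Cmp(direct) == 0) // prints: 371 371 true
}
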
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - default: - panic("not implemented") } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp @@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. 
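+// (for example, with c = 4 a window value of 15 becomes 15 - 16 = -1, with a carry of 1
+// into the next window)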
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
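+	// buffering the channel with nbTasks slots is what makes the sends from the workers
+	// below non-blocking (see the deadlock note above)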
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
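+				// this keeps the digit in [-2^{c-1}, 2^{c-1}-1], which is why
+				// 2^{c-1} buckets per chunk suffice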
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index c2d52847f3..f1cdcfe574 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -17,11 +17,7 @@ package bls24315 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
-			msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
-
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
-
-	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
-}
-
-type bucketG1AffineC1 [1 << (1 - 1)]G1Affine
-type bucketG1AffineC2 [1 << (2 - 1)]G1Affine
-type bucketG1AffineC3 [1 << (3 - 1)]G1Affine
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
 type bucketG1AffineC4 [1 << (4 - 1)]G1Affine
 type bucketG1AffineC5 [1 << (5 - 1)]G1Affine
 type bucketG1AffineC6 [1 << (6 - 1)]G1Affine
@@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine
 type bucketG1AffineC14 [1 << (14 - 1)]G1Affine
 type bucketG1AffineC15 [1 << (15 - 1)]G1Affine
 type bucketG1AffineC16 [1 << (16 - 1)]G1Affine
-type bucketG1AffineC17 [1 << (17 - 1)]G1Affine
-type bucketG1AffineC18 [1 << (18 - 1)]G1Affine
-type bucketG1AffineC19 [1 << (19 - 1)]G1Affine
 type bucketG1AffineC20 [1 << (20 - 1)]G1Affine
 type bucketG1AffineC21 [1 << (21 - 1)]G1Affine
-type bucketG1AffineC22 [1 << (22 - 1)]G1Affine
-type bucketG1AffineC23 [1 << (23 - 1)]G1Affine
-type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
-type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended
-type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
-type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
-type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
-type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
-type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
-type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
-type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
-type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
-type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
-type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
-type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
-type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
-type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
-type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
-type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended
-type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended
-type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended
-type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
-type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
-type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended
-type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended
 
 type ibG1Affine interface {
-	bucketG1AffineC1 |
-		bucketG1AffineC2 |
-		bucketG1AffineC3 |
-		bucketG1AffineC4 |
+	bucketG1AffineC4 |
 		bucketG1AffineC5 |
 		bucketG1AffineC6 |
 		bucketG1AffineC7 |
@@ -562,258 +162,21 @@ type ibG1Affine interface {
 		bucketG1AffineC14 |
 		bucketG1AffineC15 |
 		bucketG1AffineC16 |
-		bucketG1AffineC17 |
-		bucketG1AffineC18 |
-		bucketG1AffineC19 |
 		bucketG1AffineC20 |
-		bucketG1AffineC21 |
-		bucketG1AffineC22 |
-		bucketG1AffineC23
-}
-
-type ibg1JacExtended interface {
-	bucketg1JacExtendedC1 |
-		bucketg1JacExtendedC2 |
-		bucketg1JacExtendedC3 |
-		bucketg1JacExtendedC4 |
-		bucketg1JacExtendedC5 |
-		bucketg1JacExtendedC6 |
-		bucketg1JacExtendedC7 |
-		bucketg1JacExtendedC8 |
-		bucketg1JacExtendedC9 |
-		bucketg1JacExtendedC10 |
-		bucketg1JacExtendedC11 |
-		bucketg1JacExtendedC12 |
-		bucketg1JacExtendedC13 |
-		bucketg1JacExtendedC14 |
-		bucketg1JacExtendedC15 |
-		bucketg1JacExtendedC16 |
-		bucketg1JacExtendedC17 |
-		bucketg1JacExtendedC18 |
-		bucketg1JacExtendedC19 |
-		bucketg1JacExtendedC20 |
-		bucketg1JacExtendedC21 |
-		bucketg1JacExtendedC22 |
-		bucketg1JacExtendedC23
-}
-
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) {
-	var _p G2Jac
-	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
-		return nil, err
-	}
-	p.FromJacobian(&_p)
-	return p, nil
-}
-
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) {
-	// note:
-	// each of the batchAffineMsmCX method is the same, except for the c constant it declares
-	// duplicating (through template generation) these methods allows to declare the buckets on the stack
-	// the choice of c needs to be improved:
-	// there is a theoritical value that gives optimal asymptotics
-	// but in practice, other factors come into play, including:
-	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
-	// * number of CPUs
-	// * cache friendliness (which depends on the host, G1 or G2... )
-	//	--> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
-
-	// for each batchAffineMsmCX
-	// step 1
-	// we compute, for each scalars over c-bit wide windows, nbChunk digits
-	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
-	// 2^{c} to the current digit, making it negative.
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
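Steps 1–3 in miniature: the toy model below swaps the group for (Z, +) to show why signed digits halve the bucket count (a negative digit just adds the negated point) and how the weighted sum bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] falls out of a single running-sum sweep. Illustrative only, not library code:

package main

import "fmt"

func main() {
	const c = 4
	points := []int64{7, 11, 13, 17}
	digits := []int64{3, -2, 7, -8} // one recoded c-bit digit per point, in [-8, 7]

	// a negative digit adds the negated "point"; |digit| selects one of 2^{c-1} buckets
	var buckets [1 << (c - 1)]int64
	for i, d := range digits {
		switch {
		case d > 0:
			buckets[d-1] += points[i]
		case d < 0:
			buckets[-d-1] -= points[i]
		}
	}

	// weighted reduction: total = 1*buckets[0] + 2*buckets[1] + ... + n*buckets[n-1]
	var runningSum, total int64
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}

	// cross-check against the naive sum of digit*point
	var want int64
	for i := range points {
		want += digits[i] * points[i]
	}
	fmt.Println(total, total == want) // -46 true
}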
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine
+	R         [MAX_BATCH_SIZE]*G1Affine
 	batchSize int
 	cptP      int
 	bucketIds map[uint32]struct{}
-	points    []G2Affine
+	points    []G1Affine
 	buckets   *B
 }
 
-func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] {
+func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] {
 	batchSize := len(*buckets) / 5
 	if batchSize > MAX_BATCH_SIZE {
 		batchSize = MAX_BATCH_SIZE
@@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine
 	if batchSize <= 0 {
 		batchSize = 1
 	}
-	return BatchG2Affine[B]{
+	return BatchG1Affine[B]{
 		buckets:   buckets,
 		points:    points,
 		batchSize: batchSize,
@@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine
 	}
 }
 
-func (b *BatchG2Affine[B]) IsFull() bool {
+func (b *BatchG1Affine[B]) IsFull() bool {
 	return b.cptP == b.batchSize
 }
 
-func (b *BatchG2Affine[B]) ExecuteAndReset() {
+func (b *BatchG1Affine[B]) ExecuteAndReset() {
 	if b.cptP == 0 {
 		return
 	}
 	// for i := 0; i < len(b.R); i++ {
 	// 	b.R[i].Add(b.R[i], b.P[i])
 	// }
-	BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP)
+	BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP)
 	for k := range b.bucketIds {
 		delete(b.bucketIds, k)
 	}
@@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() {
 	b.cptP = 0
 }
 
-func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool {
+func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool {
 	_, ok := b.bucketIds[bID]
 	return !ok
 }
 
-func (b *BatchG2Affine[B]) Add(op batchOp) {
+func (b *BatchG1Affine[B]) Add(op batchOp) {
 	// CanAdd must be called before --> ensures bucket is not "used" in current batch
 
 	BK := &(*b.buckets)[op.bucketID]
@@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) {
 	b.cptP++
 }
 
-func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp {
 	for i := len(queue) - 1; i >= 0; i-- {
 		if batch.CanAdd(queue[i].bucketID) {
 			batch.Add(queue[i])
@@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]
 
 }
 
-func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
+// processChunkG2BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
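The batch affine addition performed by ExecuteAndReset/BatchAddG1Affine rests on Montgomery's batch-inversion trick: one modular inversion plus about three multiplications per element replaces one inversion per addition. A minimal sketch over a toy prime with math/big — batchInvert and the modulus 1000003 are illustrative stand-ins for the field-element version, and zero inputs are assumed away, as in the bucket use-case:

package main

import (
	"fmt"
	"math/big"
)

func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	// forward pass: res[i] <- product of a[0..i-1]
	for i := range a {
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	// a single inversion of the full product
	acc.ModInverse(acc, p)
	// backward pass: res[i] <- prefix_i * (prod a[0..i])^{-1} = a[i]^{-1}
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc)
		res[i].Mod(res[i], p)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(1000003) // a small prime standing in for the base field modulus
	xs := []*big.Int{big.NewInt(2), big.NewInt(42), big.NewInt(999)}
	for i, inv := range batchInvert(xs, p) {
		check := new(big.Int).Mul(xs[i], inv)
		fmt.Println(check.Mod(check, p)) // 1, 1, 1
	}
}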
+//
+// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249
+// See Section 5.3: ia.cr/2022/1396
+func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
@@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64,
 
 }
 
-func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-
-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
-		nbChunks++
-	}
-
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
-
-	// each go routine sends its result in chChunks[i] channel
-	chChunks := make([]chan g2JacExtended, nbChunks)
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
-
-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g2JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel lastC restore.
-			msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
-
-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars)
-	}
-
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
-
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
-
-	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
-}
-
-type bucketG2AffineC1 [1 << (1 - 1)]G2Affine
-type bucketG2AffineC2 [1 << (2 - 1)]G2Affine
-type bucketG2AffineC3 [1 << (3 - 1)]G2Affine
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
 type bucketG2AffineC4 [1 << (4 - 1)]G2Affine
 type bucketG2AffineC5 [1 << (5 - 1)]G2Affine
 type bucketG2AffineC6 [1 << (6 - 1)]G2Affine
@@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine
 type bucketG2AffineC14 [1 << (14 - 1)]G2Affine
 type bucketG2AffineC15 [1 << (15 - 1)]G2Affine
 type bucketG2AffineC16 [1 << (16 - 1)]G2Affine
-type bucketG2AffineC17 [1 << (17 - 1)]G2Affine
-type bucketG2AffineC18 [1 << (18 - 1)]G2Affine
-type bucketG2AffineC19 [1 << (19 - 1)]G2Affine
 type bucketG2AffineC20 [1 << (20 - 1)]G2Affine
 type bucketG2AffineC21 [1 << (21 - 1)]G2Affine
-type bucketG2AffineC22 [1 << (22 - 1)]G2Affine
-type bucketG2AffineC23 [1 << (23 - 1)]G2Affine
-type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended
-type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended
-type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended
-type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended
-type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended
-type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended
-type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0
+		if BK.Equal(P) {
+			BK.setInfinity()
+			return
+		}
+	} else {
+		// if bucket == -P, B == 0
+		if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) {
+			BK.setInfinity()
+			return
+		}
+	}
+
+	// b.bucketIds[b.cptP] = op.bucketID
+	b.bucketIds[op.bucketID] = struct{}{}
+	b.R[b.cptP] = BK
+	if op.isNeg() {
+		b.P[b.cptP].Neg(P)
+	} else {
+		b.P[b.cptP].Set(P)
+	}
+	b.cptP++
+}
+
+func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+	for i := len(queue) - 1; i >= 0; i-- {
+		if batch.CanAdd(queue[i].bucketID) {
+			batch.Add(queue[i])
+			if batch.IsFull() {
+				batch.ExecuteAndReset()
+			}
+			queue[i] = queue[len(queue)-1]
+			queue = queue[:len(queue)-1]
+		}
+	}
+	return queue
+
+}
diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go
new file mode 100644
index 0000000000..4399395829
--- /dev/null
+++ b/ecc/bls24-315/multiexp_jacobian.go
@@ -0,0 +1,229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bls24315
+
+import (
+	"github.com/consensys/gnark-crypto/ecc/bls24-315/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+	chRes chan<- g1JacExtended,
+	c uint64,
+	points []G1Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+		bucketg1JacExtendedC3 |
+		bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC6 |
+		bucketg1JacExtendedC7 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC9 |
+		bucketg1JacExtendedC10 |
+		bucketg1JacExtendedC11 |
+		bucketg1JacExtendedC12 |
+		bucketg1JacExtendedC13 |
+		bucketg1JacExtendedC14 |
+		bucketg1JacExtendedC15 |
+		bucketg1JacExtendedC16 |
+		bucketg1JacExtendedC20 |
+		bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g2JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended
+type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended
+type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended
+type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended
+type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended
+type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended
+type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended
+type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended
+type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended
+type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended
+type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended
+type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended
+type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended
+type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended
+type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended
+type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended
+type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended
+
+type ibg2JacExtended interface {
+	bucketg2JacExtendedC1 |
+		bucketg2JacExtendedC3 |
+		bucketg2JacExtendedC4 |
+		bucketg2JacExtendedC5 |
+		bucketg2JacExtendedC6 |
+		bucketg2JacExtendedC7 |
+		bucketg2JacExtendedC8 |
+		bucketg2JacExtendedC9 |
+		bucketg2JacExtendedC10 |
+		bucketg2JacExtendedC11 |
+		bucketg2JacExtendedC12 |
+		bucketg2JacExtendedC13 |
+		bucketg2JacExtendedC14 |
+		bucketg2JacExtendedC15 |
+		bucketg2JacExtendedC16 |
+		bucketg2JacExtendedC20 |
+		bucketg2JacExtendedC21
+}
diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go
index a942e21132..6a17d03fb4 100644
--- a/ecc/bls24-315/multiexp_test.go
+++ b/ecc/bls24-315/multiexp_test.go
@@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) {
 	}
 
 	scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
-	msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true)
+	innerMsmG1(&r16, 16, samplePoints[:], scalars16, true)
 
 	splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
 	splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
@@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) {
 	))
 
 	// cRange is generated from template and contains the available parameters for the multiexp window size
-	cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}
+	cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
 	if testing.Short() {
 		// test only "odd" and "even" (ie windows size divide word size vs not)
 		cRange = []uint64{5, 16}
@@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) {
 			results := make([]G1Jac, len(cRange)+1)
 			for i, c := range cRange {
 				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false)
+				innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false)
 				if c == 16 {
 					// split the first chunk
-					msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true)
+					innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true)
 				}
 			}
 			for i := 1; i < len(results); i++ {
@@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 1950ae3ef6..2cc4feb7fd 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is 
not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
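The selector being removed here (and reintroduced per chunk in multiexp_jacobian.go) picks bits [chunk*c, chunk*c+c) out of the little-endian limb array, borrowing the missing high bits from limb index+1 whenever a window straddles a 64-bit boundary. A compact sketch of just that indexing; window is a hypothetical helper, not a library function:

package main

import "fmt"

// window returns digit k of a little-endian limb array, i.e. bits [k*c, k*c+c).
func window(limbs []uint64, k, c uint64) uint64 {
	j := k * c
	index, shift := j/64, j%64
	mask := uint64(1)<<c - 1
	bits := (limbs[index] >> shift) & mask
	// when c does not divide 64, a window may straddle two limbs:
	// fetch the missing high bits from the next limb
	if shift > 64-c && index+1 < uint64(len(limbs)) {
		nbBitsHigh := shift + c - 64
		maskHigh := uint64(1)<<nbBitsHigh - 1
		bits |= (limbs[index+1] & maskHigh) << (c - nbBitsHigh)
	}
	return bits
}

func main() {
	limbs := []uint64{0xFFFFFFFFFFFFFFFF, 0x1}
	// with c = 5, chunk 12 covers bits 60..64: four bits from limb 0, one from limb 1
	fmt.Println(window(limbs, 12, 5)) // 31
}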
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
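When c does not divide fr.Limbs * 64, the most significant chunk is narrower than c bits, which is why _innerMsmG1 threads a separate processLastChunk that can use a smaller bucket array. A quick sketch of the arithmetic, assuming 256-bit scalars; note that c = 21 leaves a 4-bit top chunk, which matches the bucketg1JacExtendedC4 passed for case 21 below:

package main

import "fmt"

func main() {
	const bits = 4 * 64 // fr.Limbs * 64
	for _, c := range []uint64{5, 16, 21} {
		nbChunks := bits / c
		if bits%c != 0 {
			nbChunks++ // one extra, partial chunk at the top
		}
		lastC := bits - c*(nbChunks-1) // width of the most significant chunk
		fmt.Printf("c=%2d -> %2d chunks, last chunk %2d bits\n", c, nbChunks, lastC)
	}
}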
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)

-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
 	}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g1JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g1JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
+	}

 	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }

+// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp
+func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac {
+	var _p g1JacExtended
+	totalj := <-chChunks[len(chChunks)-1]
+	_p.Set(&totalj)
+	for j := len(chChunks) - 2; j >= 0; j-- {
+		for l := 0; l < c; l++ {
+			_p.double(&_p)
+		}
+		totalj := <-chChunks[j]
+		_p.add(&totalj)
+	}
+
+	return p.unsafeFromJacExtended(&_p)
+}
+
 // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
 //
 // This call return an error if len(scalars) != len(points) or if provided config is invalid.
@@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 	// we split recursively until nbChunks(c) >= nbTasks,
 	bestC := func(nbPoints int) uint64 {
 		// implemented msmC methods (the c we use must be in this slice)
-		implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
+		implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
 		var C uint64
 		// approximate cost (in group operations)
 		// cost = bits/c * (nbPoints + 2^{c})
@@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)

 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
-	// we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time
+	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1

 	// we have nbSplits intermediate results that we must sum together.
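An aside on the window-size heuristic kept above: cost = bits/c * (nbPoints + 2^{c}) counts bits/c chunk passes, each of which walks every point once and then folds roughly 2^c buckets. The following is a minimal standalone Go sketch, not part of this patch, that evaluates this heuristic over the implementedCs slice from this change; the 256-bit scalar size and the sample point counts are assumptions for illustration.

package main

import (
	"fmt"
	"math"
)

func bestC(nbPoints int) uint64 {
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	const bits = 4 * 64 // fr.Limbs * 64 for a 4-limb (256-bit) scalar field
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		// bits/c chunk passes, each touching nbPoints points plus ~2^c buckets
		cost := float64(bits) * float64(nbPoints+(1<<c)) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 22} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n)) // c=8, c=13, c=20 under this cost model
	}
}

With this model the bucket term 2^c dominates for small inputs, so small windows win; only in the multi-million-point range does the heuristic drift toward the large windows (16, 20, 21) that this patch keeps.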
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}
-
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
-
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+	}
-
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}
-
 	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }

 // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	return p.unsafeFromJacExtended(&_p)
 }

-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}

-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
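To make the digit decomposition above concrete, here is a standalone sketch, not part of this patch, of the same signed recoding on a bare uint64 rather than an fr.Element; it skips the msbWindow bit-packing the real code uses to store negative digits and just keeps them as signed integers. Every digit lands in [-2^{c-1}, 2^{c-1}) and the weighted sum of the digits reconstructs the scalar.

package main

import "fmt"

// recode splits scalar into signed base-2^c digits in [-2^{c-1}, 2^{c-1}).
func recode(scalar uint64, c uint) []int64 {
	max := int64(1) << (c - 1)
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := carry + int64(scalar&((1<<c)-1)) // digit = carry + low c-bit window
		scalar >>= c
		carry = 0
		if d >= max { // too large: borrow 2^c from the next window
			d -= 1 << c
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	const c = 4
	s := uint64(0xDEADBEEF)
	digits := recode(s, c)
	// reconstruct sum digits[i] * 2^{c*i}, highest digit first
	acc := int64(0)
	for i := len(digits) - 1; i >= 0; i-- {
		acc = acc<<c + digits[i]
	}
	fmt.Println(digits, acc == int64(s)) // prints the signed digits and true
}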
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 1965405349..916d8beced 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -17,11 +17,7 @@ package bls24317 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
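The step-3 reduction referenced above (msmReduceChunk) combines the per-chunk partial sums with Horner's rule: the result is the sum of chunk_j * 2^{c*j}, obtained by c doublings between consecutive chunk additions, highest chunk first. A standalone sketch follows, not part of this patch, with big.Int addition standing in for the extended-Jacobian group law.

package main

import (
	"fmt"
	"math/big"
)

// reduceChunks mirrors the msmReduceChunk* shape: start from the highest
// chunk, double c times, then add the next chunk down (Horner's rule).
func reduceChunks(chunks []*big.Int, c uint) *big.Int {
	total := new(big.Int).Set(chunks[len(chunks)-1])
	for j := len(chunks) - 2; j >= 0; j-- {
		for l := uint(0); l < c; l++ {
			total.Add(total, total) // stands in for _p.double(&_p)
		}
		total.Add(total, chunks[j]) // stands in for _p.add(&totalj)
	}
	return total
}

func main() {
	c := uint(5)
	// per-chunk partial sums, least significant chunk first
	chunks := []*big.Int{big.NewInt(7), big.NewInt(3), big.NewInt(1)}
	// expected: 7 + 3*2^5 + 1*2^10 = 1127
	fmt.Println(reduceChunks(chunks, c)) // 1127
}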
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
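The batch machinery above exists to amortize the field inversion in the affine addition law: ExecuteAndReset hands a full batch to BatchAddG1Affine, which relies on Montgomery's batch-inversion trick, one modular inversion plus roughly 3(n-1) multiplications for n slope denominators. Below is a standalone sketch of that trick, not part of this patch; big.Int arithmetic modulo a toy prime stands in for the base field.

package main

import (
	"fmt"
	"math/big"
)

// batchInvert inverts all a[i] mod p with a single ModInverse call,
// trading per-element inversions for a handful of multiplications.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	n := len(a)
	res := make([]*big.Int, n)
	acc := big.NewInt(1)
	for i := 0; i < n; i++ { // res[i] = a[0]*...*a[i-1] (prefix products)
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // single inversion of the full product
	for i := n - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // (a[0]..a[i-1]) * (a[0]..a[i])^-1 = a[i]^-1
		acc.Mul(acc, a[i]).Mod(acc, p)         // acc becomes (a[0]..a[i-1])^-1
	}
	return res
}

func main() {
	p := big.NewInt(101) // toy prime standing in for the base field modulus
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(42)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(check.Mod(check, p)) // 1, 1, 1
	}
}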
- msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG1AffineC4 [1 << (4 - 1)]G1Affine type bucketG1AffineC5 [1 << (5 - 1)]G1Affine type bucketG1AffineC6 [1 << (6 - 1)]G1Affine @@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine type bucketG1AffineC20 [1 << (20 - 1)]G1Affine type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | + bucketG1AffineC4 | bucketG1AffineC5 | bucketG1AffineC6 | bucketG1AffineC7 | @@ -562,258 +162,21 @@ type ibG1Affine interface { bucketG1AffineC14 | bucketG1AffineC15 | bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - 
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
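The sizing loop above decides how the work is spread across CPUs: if bestC(nbPoints) yields fewer than nbTasks chunks, the point set is halved (and nbSplits doubled) until nbChunks * nbSplits covers the task budget. A standalone sketch of that loop follows, not part of this patch; the fixed-window bestC placeholder and the 256-bit scalar size are assumptions.

package main

import "fmt"

const scalarBits = 4 * 64 // fr.Limbs * 64 for a 4-limb scalar field

// bestC placeholder: a fixed threshold keeps the sketch short.
func bestC(nbPoints int) uint64 {
	if nbPoints < 1<<14 {
		return 8
	}
	return 16
}

// splitPlan mirrors the driver loop: halve the points and double the splits
// until nbChunks(C) * nbSplits reaches the task budget.
func splitPlan(nbPoints, nbTasks int) (C uint64, nbSplits int) {
	nbSplits = 1
	nbChunks := 0
	for nbChunks < nbTasks {
		C = bestC(nbPoints)
		nbChunks = scalarBits / int(C) // number of c-bit radixes in a scalar
		if scalarBits%int(C) != 0 {
			nbChunks++
		}
		nbChunks *= nbSplits
		if nbChunks < nbTasks {
			nbSplits <<= 1
			nbPoints >>= 1
		}
	}
	return C, nbSplits
}

func main() {
	C, nbSplits := splitPlan(1<<20, 64)
	fmt.Println(C, nbSplits) // 16 4: four splits of 16 chunks each cover 64 tasks
}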
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine batchSize int cptP int bucketIds map[uint32]struct{} - points []G2Affine + points []G1Affine buckets *B } -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine[B]{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, @@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine } } -func (b *BatchG2Affine[B]) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine[B]) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } // for i := 0; i < len(b.R); i++ { // b.R[i].Add(b.R[i], b.P[i]) // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) for k := range b.bucketIds { delete(b.bucketIds, k) } @@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine[B]) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &(*b.buckets)[op.bucketID] @@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] } -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, } -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG2AffineC4 [1 << (4 - 1)]G2Affine type bucketG2AffineC5 [1 << (5 - 1)]G2Affine type bucketG2AffineC6 [1 << (6 - 1)]G2Affine @@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine type bucketG2AffineC20 [1 << (20 - 1)]G2Affine type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended 
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0 + if BK.Equal(P) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { + BK.setInfinity() + return + } + } + + // b.bucketIds[b.cptP] = op.bucketID + b.bucketIds[op.bucketID] = struct{}{} + b.R[b.cptP] = BK + if op.isNeg() { + b.P[b.cptP].Neg(P) + } else { + b.P[b.cptP].Set(P) + } + b.cptP++ +} + +func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { + for i := len(queue) - 1; i >= 0; i-- { + if batch.CanAdd(queue[i].bucketID) { + batch.Add(queue[i]) + if batch.IsFull() { + batch.ExecuteAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + } + } + return queue + } diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go new file mode 100644 index 0000000000..d948e2c697 --- /dev/null +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -0,0 +1,229 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bls24317 + +import ( + "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" +) + +func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC1 |
+	bucketg1JacExtendedC3 |
+	bucketg1JacExtendedC4 |
+	bucketg1JacExtendedC5 |
+	bucketg1JacExtendedC6 |
+	bucketg1JacExtendedC7 |
+	bucketg1JacExtendedC8 |
+	bucketg1JacExtendedC9 |
+	bucketg1JacExtendedC10 |
+	bucketg1JacExtendedC11 |
+	bucketg1JacExtendedC12 |
+	bucketg1JacExtendedC13 |
+	bucketg1JacExtendedC14 |
+	bucketg1JacExtendedC15 |
+	bucketg1JacExtendedC16 |
+	bucketg1JacExtendedC20 |
+	bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1]
+
+	var runningSum, total g2JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended
+type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended
+type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended
+type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended
+type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended
+type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended
+type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended
+type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended
+type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended
+type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended
+type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended
+type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended
+type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended
+type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended
+type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended
+type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended
+type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended
+
+type ibg2JacExtended interface {
+	bucketg2JacExtendedC1 |
+	bucketg2JacExtendedC3 |
+	bucketg2JacExtendedC4 |
+	bucketg2JacExtendedC5 |
+	bucketg2JacExtendedC6 |
+	bucketg2JacExtendedC7 |
+	bucketg2JacExtendedC8 |
+	bucketg2JacExtendedC9 |
+	bucketg2JacExtendedC10 |
+	bucketg2JacExtendedC11 |
+	bucketg2JacExtendedC12 |
+	bucketg2JacExtendedC13 |
+	bucketg2JacExtendedC14 |
+	bucketg2JacExtendedC15 |
+	bucketg2JacExtendedC16 |
+	bucketg2JacExtendedC20 |
+	bucketg2JacExtendedC21
+}
diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go
index 293f3ec6ef..7e39930e23 100644
--- a/ecc/bls24-317/multiexp_test.go
+++ b/ecc/bls24-317/multiexp_test.go
@@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) {
 	}
 
 	scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU())
-	msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true)
+	innerMsmG1(&r16, 16, samplePoints[:], scalars16, true)
 
 	splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
 	splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
@@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) {
 	))
 
 	// cRange is generated from template and contains the available parameters for the multiexp window size
-	cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}
+	cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
 	if testing.Short() {
 		// test only "odd" and "even" (ie windows size divide word size vs not)
 		cRange = []uint64{5, 16}
@@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) {
 			results := make([]G1Jac, len(cRange)+1)
 			for i, c := range cRange {
 				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false)
+				innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false)
 				if c == 16 {
 					// split the first chunk
-					msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true)
+					innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true)
 				}
 			}
 			for i := 1; i < len(results); i++ {
@@ -171,10 +171,10 @@ func
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index c2ecab3d61..e519895eb5 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn 
more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
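
The bestC closure above scores every implemented window width c with cost(c) = (fr.Limbs*64/c) * (nbPoints + 2^c): a scalar splits into roughly bits/c chunks, each chunk touches every point once and pays one pass over its buckets (approximated as 2^c). Below is a minimal standalone sketch of that cost model, not part of the patch; scalarBits stands in for fr.Limbs*64 and the names are illustrative.

package main

import (
	"fmt"
	"math"
)

// bestC mirrors the selection loop in MultiExp: pick the window width c
// that minimizes cost(c) = scalarBits/c * (nbPoints + 2^c).
func bestC(nbPoints, scalarBits int, implementedCs []uint64) uint64 {
	var C uint64
	min := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(scalarBits*(nbPoints+(1<<c))) / float64(c)
		if cost < min {
			min = cost
			C = c
		}
	}
	return C
}

func main() {
	cs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21}
	for _, n := range []int{1 << 10, 1 << 16, 1 << 22} {
		fmt.Printf("nbPoints=%d -> c=%d\n", n, bestC(n, 256, cs))
	}
}

For small instances the 2^c bucket term dominates and small windows win; as nbPoints grows, the bits/c factor dominates and larger windows pay off, which is why the trimmed cRange still keeps 20 and 21.
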
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG1Affine[bucketg1JacExtendedC11, 
bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC20] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC21] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - 
_p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +237,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. 
+ go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +389,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
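
The per-chunk loops and msmReduceChunkG1Affine both rely on the same running-sum identity: bucket[k] holds the points whose digit is k+1, and the chunk total sum_k (k+1)*bucket[k] equals the sum of all suffix sums sum_{j>=k} bucket[j], so the reduction needs only about 2*len(buckets) group additions and no scalar multiplications. A small integer sketch of that reduction, not part of the patch:

package main

import "fmt"

// reduceBuckets computes 1*b[0] + 2*b[1] + ... + n*b[n-1] with additions
// only: after the iteration for index k, runningSum = b[k] + ... + b[n-1],
// and adding it into total once per iteration weights b[j] exactly j+1 times.
func reduceBuckets(buckets []int) int {
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}
	return total
}

func main() {
	b := []int{3, 0, 7, 1}        // buckets for digits 1, 2, 3, 4
	fmt.Println(reduceBuckets(b)) // 28 = 1*3 + 2*0 + 3*7 + 4*1
}

The !buckets[k].ZZ.IsZero() guard in the real code merely skips the addition for empty buckets; it does not change the identity.
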
@@ -617,12 +399,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +413,120 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 11: - msmCG2Affine[bucketg2JacExtendedC11, 
bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + case 20: + processChunk := processChunkG2BatchAffine[bucketG2AffineC20] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + case 21: + processChunk := processChunkG2BatchAffine[bucketG2AffineC21] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 
0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g2JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - default: - panic("not implemented") } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp @@ -725,121 +545,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J return p.unsafeFromJacExtended(&_p) } -func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { +// selector stores the index, mask and shifts needed to select bits from a scalar +// it is used during the multiExp algorithm or the batch scalar multiplication +type selector struct { + index uint64 // index in the multi-word scalar to select bits from + mask uint64 // mask (c-bit wide) + shift uint64 // shift needed to get our bits on low positions - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) + maskHigh uint64 // same than mask, for index+1 + shiftHigh uint64 // same than shift, for index+1 +} - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits +// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract +// 2^{c} to the current digit, making it negative. 
+// negative digits can be processed in a later step as adding -G into the bucket instead of G +// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) +// scalarsMont indicates wheter the provided scalars are in montgomery form +// returns smallValues, which represent the number of scalars which meets the following condition +// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { + toReturn := make([]fr.Element, len(scalars)) - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) } + selectors[chunk] = d } - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 89db40edae..8b10c9786f 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -17,11 +17,7 @@ package bn254 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,320 +30,13 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } -} - -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points 
[]G1Affine - buckets *B -} - -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -441,66 +130,8 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
- msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG1AffineC4 [1 << (4 - 1)]G1Affine type bucketG1AffineC5 [1 << (5 - 1)]G1Affine type bucketG1AffineC6 [1 << (6 - 1)]G1Affine @@ -514,42 +145,11 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine type bucketG1AffineC20 [1 << (20 - 1)]G1Affine type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | + bucketG1AffineC4 | bucketG1AffineC5 | bucketG1AffineC6 | bucketG1AffineC7 | @@ -562,258 +162,21 @@ type ibG1Affine interface { bucketG1AffineC14 | bucketG1AffineC15 | bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - 
bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC1](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC1](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC4](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC4](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC4](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC6](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC3](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC4](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC9](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC4](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC1](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC1](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC4](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC9](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC16](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC4](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC14](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC3](p, 23, points, scalars, splitFirstChunk) - - default: - panic("not implemented") - } + bucketG1AffineC21 } -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine +type BatchG1Affine[B ibG1Affine] struct { + P 
[MAX_BATCH_SIZE]G1Affine + R [MAX_BATCH_SIZE]*G1Affine batchSize int cptP int bucketIds map[uint32]struct{} - points []G2Affine + points []G1Affine buckets *B } -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { +func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { batchSize := len(*buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -821,7 +184,7 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine if batchSize <= 0 { batchSize = 1 } - return BatchG2Affine[B]{ + return BatchG1Affine[B]{ buckets: buckets, points: points, batchSize: batchSize, @@ -829,18 +192,18 @@ func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine } } -func (b *BatchG2Affine[B]) IsFull() bool { +func (b *BatchG1Affine[B]) IsFull() bool { return b.cptP == b.batchSize } -func (b *BatchG2Affine[B]) ExecuteAndReset() { +func (b *BatchG1Affine[B]) ExecuteAndReset() { if b.cptP == 0 { return } // for i := 0; i < len(b.R); i++ { // b.R[i].Add(b.R[i], b.P[i]) // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) for k := range b.bucketIds { delete(b.bucketIds, k) } @@ -848,12 +211,12 @@ func (b *BatchG2Affine[B]) ExecuteAndReset() { b.cptP = 0 } -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { +func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { _, ok := b.bucketIds[bID] return !ok } -func (b *BatchG2Affine[B]) Add(op batchOp) { +func (b *BatchG1Affine[B]) Add(op batchOp) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &(*b.buckets)[op.bucketID] @@ -895,7 +258,7 @@ func (b *BatchG2Affine[B]) Add(op batchOp) { b.cptP++ } -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { +func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { for i := len(queue) - 1; i >= 0; i-- { if batch.CanAdd(queue[i].bucketID) { batch.Add(queue[i]) @@ -910,7 +273,13 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] } -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
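A batch may hold at most one pending addition per bucket: a flush computes every sum from the bucket values as they stood when the batch was formed, so a second op on the same bucket would read a stale value. A toy integer sketch that mimics the CanAdd/queue discipline (hypothetical names, ints standing in for points):

package main

import "fmt"

// toyBatch mimics the CanAdd/Add/ExecuteAndReset flow: at most one pending
// addition per bucket within a batch.
type toyBatch struct {
	pending map[int]int // bucketID -> value waiting to be added
	buckets []int
}

func (b *toyBatch) canAdd(bucketID int) bool {
	_, busy := b.pending[bucketID]
	return !busy
}

func (b *toyBatch) add(bucketID, v int) { b.pending[bucketID] = v }

func (b *toyBatch) executeAndReset() {
	for id, v := range b.pending { // one "batched" pass over all pending ops
		b.buckets[id] += v
	}
	b.pending = make(map[int]int)
}

func main() {
	b := &toyBatch{pending: make(map[int]int), buckets: make([]int, 4)}
	var queue [][2]int // ops that hit a busy bucket wait here
	ops := [][2]int{{1, 10}, {2, 20}, {1, 30}, {3, 5}} // bucket 1 appears twice
	for _, op := range ops {
		if b.canAdd(op[0]) {
			b.add(op[0], op[1])
		} else {
			queue = append(queue, op) // same bucket already in this batch
		}
	}
	b.executeAndReset()
	for _, op := range queue { // queued ops go into the next batch
		b.add(op[0], op[1])
	}
	b.executeAndReset()
	fmt.Println(b.buckets) // [0 40 20 5]
}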
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -1004,66 +373,8 @@ func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, } -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack type bucketG2AffineC4 [1 << (4 - 1)]G2Affine type bucketG2AffineC5 [1 << (5 - 1)]G2Affine type bucketG2AffineC6 [1 << (6 - 1)]G2Affine @@ -1077,42 +388,11 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine type bucketG2AffineC20 [1 << (20 - 1)]G2Affine type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended 
-type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | + bucketG2AffineC4 | bucketG2AffineC5 | bucketG2AffineC6 | bucketG2AffineC7 | @@ -1125,37 +405,113 @@ type ibG2Affine interface { bucketG2AffineC14 | bucketG2AffineC15 | bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 + bucketG2AffineC21 +} + +type BatchG2Affine[B ibG2Affine] struct { + P [MAX_BATCH_SIZE]G2Affine + R [MAX_BATCH_SIZE]*G2Affine + batchSize int + cptP int + bucketIds map[uint32]struct{} + points []G2Affine + buckets *B } -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 +func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { + batchSize := len(*buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + return BatchG2Affine[B]{ + buckets: buckets, + points: points, + batchSize: batchSize, + bucketIds: make(map[uint32]struct{}, len(*buckets)/2), + } +} + +func (b *BatchG2Affine[B]) IsFull() bool { + return b.cptP == b.batchSize +} + +func (b *BatchG2Affine[B]) ExecuteAndReset() { + if b.cptP == 0 { + return + } + // for i := 0; i < len(b.R); i++ { + // b.R[i].Add(b.R[i], b.P[i]) + // } + BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) + for k := range b.bucketIds { + delete(b.bucketIds, k) + } + // b.bucketIds = [MAX_BATCH_SIZE]uint32{} + b.cptP = 0 +} + +func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { + _, ok := b.bucketIds[bID] + return !ok +} + +func (b *BatchG2Affine[B]) Add(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &(*b.buckets)[op.bucketID] + P := &b.points[op.pointID>>1] + if P.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(P) + } else { + BK.Set(P) + } + return + } + if op.isNeg() { + // if bucket == P --> -P 
== 0
+ if BK.Equal(P) {
+ BK.setInfinity()
+ return
+ }
+ } else {
+ // if bucket == -P, B == 0
+ if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) {
+ BK.setInfinity()
+ return
+ }
+ }
+
+ // b.bucketIds[b.cptP] = op.bucketID
+ b.bucketIds[op.bucketID] = struct{}{}
+ b.R[b.cptP] = BK
+ if op.isNeg() {
+ b.P[b.cptP].Neg(P)
+ } else {
+ b.P[b.cptP].Set(P)
+ }
+ b.cptP++
+}
+
+func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp {
+ for i := len(queue) - 1; i >= 0; i-- {
+ if batch.CanAdd(queue[i].bucketID) {
+ batch.Add(queue[i])
+ if batch.IsFull() {
+ batch.ExecuteAndReset()
+ }
+ queue[i] = queue[len(queue)-1]
+ queue = queue[:len(queue)-1]
+ }
+ }
+ return queue
+
}
diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go
new file mode 100644
index 0000000000..4939af44c8
--- /dev/null
+++ b/ecc/bn254/multiexp_jacobian.go
@@ -0,0 +1,229 @@
+// Copyright 2020 ConsenSys Software Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bn254
+
+import (
+ "github.com/consensys/gnark-crypto/ecc/bn254/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+ chRes chan<- g1JacExtended,
+ c uint64,
+ points []G1Affine,
+ scalars []fr.Element) {
+
+ mask := uint64((1 << c) - 1) // low c bits are 1
+ msbWindow := uint64(1 << (c - 1))
+
+ var buckets B
+ for i := 0; i < len(buckets); i++ {
+ buckets[i].setInfinity()
+ }
+
+ jc := uint64(chunk * c)
+ s := selector{}
+ s.index = jc / 64
+ s.shift = jc - (s.index * 64)
+ s.mask = mask << s.shift
+ s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+ if s.multiWordSelect {
+ nbBitsHigh := s.shift - uint64(64-c)
+ s.maskHigh = (1 << nbBitsHigh) - 1
+ s.shiftHigh = (c - nbBitsHigh)
+ }
+
+ // for each scalar, get the digit corresponding to the chunk we're processing.
+ for i := 0; i < len(scalars); i++ {
+ bits := (scalars[i][s.index] & s.mask) >> s.shift
+ if s.multiWordSelect {
+ bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+ }
+
+ if bits == 0 {
+ continue
+ }
+
+ // if msbWindow bit is set, we need to subtract
+ if bits&msbWindow == 0 {
+ // add
+ buckets[bits-1].addMixed(&points[i])
+ } else {
+ // sub
+ buckets[bits & ^msbWindow].subMixed(&points[i])
+ }
+ }
+
+ // reduce buckets into total
+ // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
+
+ var runningSum, total g1JacExtended
+ runningSum.setInfinity()
+ total.setInfinity()
+ for k := len(buckets) - 1; k >= 0; k-- {
+ if !buckets[k].ZZ.IsZero() {
+ runningSum.add(&buckets[k])
+ }
+ total.add(&runningSum)
+ }
+
+ chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended
+type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended
+type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended
+type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended
+type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended
+type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended
+type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended
+type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended
+type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended
+type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended
+type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+ bucketg1JacExtendedC1 |
+ bucketg1JacExtendedC3 |
+ bucketg1JacExtendedC4 |
+ bucketg1JacExtendedC5 |
+ bucketg1JacExtendedC6 |
+ bucketg1JacExtendedC7 |
+ bucketg1JacExtendedC8 |
+ bucketg1JacExtendedC9 |
+ bucketg1JacExtendedC10 |
+ bucketg1JacExtendedC11 |
+ bucketg1JacExtendedC12 |
+ bucketg1JacExtendedC13 |
+ bucketg1JacExtendedC14 |
+ bucketg1JacExtendedC15 |
+ bucketg1JacExtendedC16 |
+ bucketg1JacExtendedC20 |
+ bucketg1JacExtendedC21
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+ chRes chan<- g2JacExtended,
+ c uint64,
+ points []G2Affine,
+ scalars []fr.Element) {
+
+ mask := uint64((1 << c) - 1) // low c bits are 1
+ msbWindow := uint64(1 << (c - 1))
+
+ var buckets B
+ for i := 0; i < len(buckets); i++ {
+ buckets[i].setInfinity()
+ }
+
+ jc := uint64(chunk * c)
+ s := selector{}
+ s.index = jc / 64
+ s.shift = jc - (s.index * 64)
+ s.mask = mask << s.shift
+ s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+ if s.multiWordSelect {
+ nbBitsHigh := s.shift - uint64(64-c)
+ s.maskHigh = (1 << nbBitsHigh) - 1
+ s.shiftHigh = (c - nbBitsHigh)
+ }
+
+ // for each scalar, get the digit corresponding to the chunk we're processing.
+ for i := 0; i < len(scalars); i++ {
+ bits := (scalars[i][s.index] & s.mask) >> s.shift
+ if s.multiWordSelect {
+ bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+ }
+
+ if bits == 0 {
+ continue
+ }
+
+ // if msbWindow bit is set, we need to subtract
+ if bits&msbWindow == 0 {
+ // add
+ buckets[bits-1].addMixed(&points[i])
+ } else {
+ // sub
+ buckets[bits & ^msbWindow].subMixed(&points[i])
+ }
+ }
+
+ // reduce buckets into total
+ // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended +type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended +type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended +type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended +type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC1 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC6 | + bucketg2JacExtendedC7 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC9 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC13 | + bucketg2JacExtendedC14 | + bucketg2JacExtendedC15 | + bucketg2JacExtendedC16 | + bucketg2JacExtendedC20 | + bucketg2JacExtendedC21 +} diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 8a8ee0e90d..7fbb203ce1 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func 
TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if 
!result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index e49016a90e..ceb1ad7847 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going 
to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. - chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
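The bestC heuristic above can be probed on its own. A standalone sketch of the cost scan, assuming 256-bit scalars (i.e. fr.Limbs * 64 = 256) and the implemented window sizes {4, 5, 8, 16}:

package main

import (
	"fmt"
	"math"
)

// bestC scans the implemented window sizes and keeps the one minimizing the
// approximate group-operation count (bits/c) * (nbPoints + 2^c).
func bestC(nbPoints int, scalarBits float64, implementedCs []uint64) uint64 {
	var C uint64
	bestCost := math.MaxFloat64
	for _, c := range implementedCs {
		cost := scalarBits / float64(c) * float64(nbPoints+(1<<c))
		if cost < bestCost {
			bestCost = cost
			C = c
		}
	}
	return C
}

func main() {
	implementedCs := []uint64{4, 5, 8, 16}
	for _, n := range []int{1 << 8, 1 << 14, 1 << 20} {
		fmt.Printf("%8d points -> c = %d\n", n, bestC(n, 256, implementedCs))
	}
}

Larger inputs tolerate bigger (costlier) bucket arrays because the per-point work shrinks as bits/c, which is why the scan picks c = 5, 8, 16 for 2^8, 2^14, 2^20 points respectively.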
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,34 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC2](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC5](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC2](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC5](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC5](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC10](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC1](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC8](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC8](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC12](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC5](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC14](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC14](p, 18, points, scalars, splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, 
bucketg1JacExtendedC16](p, 19, points, scalars, splitFirstChunk) - - case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC20](p, 20, points, scalars, splitFirstChunk) - - case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC5](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC12](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC21](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +192,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
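Aside: every chunk processor reduces its buckets as total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] with a single backward pass and a running sum, two additions per bucket. An integer stand-in (plain additions in place of point additions):

package main

import "fmt"

// reduceBuckets computes b[0] + 2*b[1] + ... + n*b[n-1]: runningSum
// accumulates b[n-1]+...+b[k], and adding it into total once per k
// contributes b[k] exactly k+1 times overall. (The real code also skips
// empty buckets when updating runningSum; the result is the same.)
func reduceBuckets(buckets []int) int {
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}
	return total
}

func main() {
	buckets := []int{3, 0, 7, 1} // bucket[k] holds the sum of points with digit k+1
	naive := 0
	for k, b := range buckets {
		naive += (k + 1) * b // 1*3 + 2*0 + 3*7 + 4*1
	}
	fmt.Println(reduceBuckets(buckets), naive) // 28 28
}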
+ if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +299,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +344,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
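msmReduceChunkG1Affine above is a Horner evaluation in base 2^c: between consecutive chunk totals it doubles c times, which multiplies the accumulator by 2^c. An integer analogue (doubling a number stands in for doubling a point):

package main

import "fmt"

// reduceChunks folds per-chunk partial sums T_j into sum_j T_j * 2^(c*j),
// doubling c times between consecutive chunks, mirroring the
// _p.double(&_p) loop in msmReduceChunkG1Affine.
func reduceChunks(chunkTotals []int, c int) int {
	p := chunkTotals[len(chunkTotals)-1] // highest chunk first
	for j := len(chunkTotals) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			p *= 2 // stands in for _p.double(&_p)
		}
		p += chunkTotals[j]
	}
	return p
}

func main() {
	const c = 4
	// chunk digits of the scalar 0x321 seen in base 2^4, LSB chunk first
	chunkTotals := []int{0x1, 0x2, 0x3}
	fmt.Println(reduceChunks(chunkTotals, c) == 0x321) // true
}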
@@ -617,12 +354,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +368,75 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC2](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC5](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC2](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC5](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC5](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC10](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC1](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC8](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC8](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC12](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC5](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + default: + panic("not implemented") + } +} - case 17: - msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC14](p, 17, points, scalars, 
splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC20](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC5](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC12](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC21](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}

-	default:
-		panic("not implemented")
	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
}

// msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +455,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
	return p.unsafeFromJacExtended(&_p)
}

-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}

-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar, its nbChunks digits over c-bit wide windows.
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
	}

-	// for each scalars, get the digit corresponding to the chunk we're processing.
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1)      // low c bits are 1
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words

-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
		}
+		selectors[chunk] = d
	}

-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)

-	var runningSum, total g2JacExtended
-	runningSum.setInfinity()
-	total.setInfinity()
-	for k := len(buckets) - 1; k >= 0; k-- {
-		if !buckets[k].ZZ.IsZero() {
-			runningSum.add(&buckets[k])
-		}
-		total.add(&runningSum)
-	}
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int

-	chRes <- total
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}

-}
+			// for each chunk in the scalar, compute the current digit, and a possible carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]

-func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac {
-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
-		nbChunks++
-	}
-	// for each chunk, spawn one go routine that'll loop through all the scalars in the
-	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+			// init with carry if any
+			digit := carry
+			carry = 0

-	// each go routine sends its result in chChunks[i] channel
-	chChunks := make([]chan g2JacExtended, nbChunks)
-	for i := 0; i < len(chChunks); i++ {
-		chChunks[i] = make(chan g2JacExtended, 1)
-	}
+			// digit = value of the c-bit window
+			digit += int((scalar[s.index] & s.mask) >> s.shift)

-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G2Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g2JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel last C restore.
-			msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}

-	processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) {
-		msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars)
-	}
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}

-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 5064b454a6..221079f874 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -17,11 +17,7 @@ package bw6633 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,210 +30,118 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } + batch := newBatchG1Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh } - // empirical, needs to be tuned. 
- // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 + if bits == 0 { + continue } - } - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. 
} - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC2](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC5](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC2](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC5](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC5](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC10](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC1](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC8](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC8](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC12](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC5](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC14](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC14](p, 18, points, scalars, splitFirstChunk) + // flush items in batch. + batch.ExecuteAndReset() - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC16](p, 19, points, scalars, splitFirstChunk) + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]

-	case 20:
-		batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC20](p, 20, points, scalars, splitFirstChunk)
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].IsInfinity() {
+			runningSum.addMixed(&buckets[k])
+		}
+		total.add(&runningSum)
+	}

-	case 21:
-		batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC5](p, 21, points, scalars, splitFirstChunk)
+	chRes <- total

-	case 22:
-		batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC12](p, 22, points, scalars, splitFirstChunk)
+}

-	case 23:
-		batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC21](p, 23, points, scalars, splitFirstChunk)
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketG1AffineC4 [1 << (4 - 1)]G1Affine
+type bucketG1AffineC5 [1 << (5 - 1)]G1Affine
+type bucketG1AffineC8 [1 << (8 - 1)]G1Affine
+type bucketG1AffineC16 [1 << (16 - 1)]G1Affine

-	default:
-		panic("not implemented")
-	}
+type ibG1Affine interface {
+	bucketG1AffineC4 |
+		bucketG1AffineC5 |
+		bucketG1AffineC8 |
+		bucketG1AffineC16
}

type BatchG1Affine[B ibG1Affine] struct {
@@ -347,10 +251,16 @@ func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]

}

-func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
-	chRes chan<- g1JacExtended,
+// processChunkG2BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
+//
+// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249
+// See Section 5.3: ia.cr/2022/1396
+func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
	c uint64,
-	points []G1Affine,
+	points []G2Affine,
	scalars []fr.Element) {

	mask := uint64((1 << c) - 1) // low c bits are 1
@@ -372,7 +282,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
		s.shiftHigh = (c - nbBitsHigh)
	}

-	batch := newBatchG1Affine(&buckets, points)
+	batch := newBatchG2Affine(&buckets, points)
	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
	nbBatches := 0
	for i := 0; i < len(scalars); i++ {
@@ -417,7 +327,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
	// chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points))
	// batch.ExecuteAndReset()
	for len(queue) != 0 {
-		queue = processQueueG1Affine(queue, &batch)
+		queue = processQueueG2Affine(queue, &batch)
		batch.ExecuteAndReset() // execute batch even if not full.
	}

@@ -427,7 +337,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
	// reduce buckets into total
	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - var runningSum, total g1JacExtended + var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { @@ -441,366 +351,18 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 
[1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended - -type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | - bucketG1AffineC11 | - bucketG1AffineC12 | - bucketG1AffineC13 | - bucketG1AffineC14 | - bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | - bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... 
) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
-	_p := make([]G2Jac, nbSplits-1)
-	chDone := make(chan int, nbSplits-1)
-	for i := 0; i < nbSplits-1; i++ {
-		start := i * nbPoints
-		end := start + nbPoints
-		go func(start, end, i int) {
-			msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-			chDone <- i
-		}(start, end, i)
-	}
-
-	msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
-	for i := 0; i < nbSplits-1; i++ {
-		done := <-chDone
-		p.AddAssign(&_p[done])
-	}
-	close(chDone)
-	return p, nil
-}
-
-func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) {
-
-	switch c {
-
-	case 1:
-		msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk)
-
-	case 2:
-		msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk)
-
-	case 3:
-		msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC2](p, 3, points, scalars, splitFirstChunk)
-
-	case 4:
-		msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk)
-
-	case 5:
-		msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC5](p, 5, points, scalars, splitFirstChunk)
-
-	case 6:
-		msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC2](p, 6, points, scalars, splitFirstChunk)
-
-	case 7:
-		msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC5](p, 7, points, scalars, splitFirstChunk)
-
-	case 8:
-		msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk)
-
-	case 9:
-		msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC5](p, 9, points, scalars, splitFirstChunk)
-
-	case 10:
-		batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC10](p, 10, points, scalars, splitFirstChunk)
-
-	case 11:
-		batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC1](p, 11, points, scalars, splitFirstChunk)
-
-	case 12:
-		batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC8](p, 12, points, scalars, splitFirstChunk)
-
-	case 13:
-		batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC8](p, 13, points, scalars, splitFirstChunk)
-
-	case 14:
-		batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC12](p, 14, points, scalars, splitFirstChunk)
-
-	case 15:
-		batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC5](p, 15, points, scalars, splitFirstChunk)
-
-	case 16:
-		batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk)
-
-	case 17:
-		batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC14](p, 17, points, scalars, splitFirstChunk)
-
-	case 18:
-		batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC14](p, 18, points, scalars, splitFirstChunk)
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketG2AffineC4 [1 << (4 - 1)]G2Affine
+type bucketG2AffineC5 [1 << (5 - 1)]G2Affine
+type
bucketG2AffineC8 [1 << (8 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine - default: - panic("not implemented") - } +type ibG2Affine interface { + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC8 | + bucketG2AffineC16 } type BatchG2Affine[B ibG2Affine] struct { @@ -909,253 +471,3 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] return queue } - -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type 
bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended - -type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | - bucketG2AffineC11 | - bucketG2AffineC12 | - bucketG2AffineC13 | - bucketG2AffineC14 | - bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | - bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 -} - -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 -} diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go new file mode 100644 index 0000000000..f331d07491 --- /dev/null +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -0,0 +1,177 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
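An aside on the reduction used throughout the processChunk functions in the generated file below: the buckets are folded with a single running sum, so that total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] costs only two group additions per bucket. A minimal, self-contained sketch of the same scheme over plain integers (reduceBucketsSketch is an illustrative name, not part of the generated code):

package main

import "fmt"

// reduceBucketsSketch folds buckets from the last one down: after the step
// for index k, runningSum holds bucket[k] + bucket[k+1] + ... + bucket[n-1],
// so adding it into total counts bucket[k] exactly (k+1) times.
func reduceBucketsSketch(buckets []int) int {
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k]
		total += runningSum
	}
	return total
}

func main() {
	fmt.Println(reduceBucketsSketch([]int{3, 1, 4})) // 1*3 + 2*1 + 3*4 = 17
}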
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bw6633
+
+import (
+	"github.com/consensys/gnark-crypto/ecc/bw6-633/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+	chRes chan<- g1JacExtended,
+	c uint64,
+	points []G1Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC16
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC16 +} diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 32d3f4e986..0a8268cd6a 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 8, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { 
b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 2ab93e7fd9..ee5ff35a9a 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // 
same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
- chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
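For intuition, the signed-digit recoding performed by partitionScalars can be reproduced on a plain uint64. The sketch below is illustrative only: the real code operates on multi-limb fr.Element values and packs the recoded digits back into the scalar in place, and the helper name signedDigits is not part of this patch.

package main

import "fmt"

// signedDigits decomposes a small scalar into signed c-bit digits, mirroring
// the borrow/carry idea of partitionScalars (illustrative only).
func signedDigits(scalar uint64, c uint) []int {
	mask := uint64(1<<c) - 1
	max := int(1 << (c - 1))
	var digits []int
	carry := 0
	for scalar != 0 || carry != 0 {
		d := int(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d >= max {
			d -= 1 << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	// 0xDE = 1101_1110: with c=4 the windows are [14, 13]; 14 >= 8 so it
	// becomes -2 with a carry, giving digits [-2, -2, 1], and indeed
	// -2 + (-2)<<4 + 1<<8 == 0xDE.
	fmt.Println(signedDigits(0xDE, 4))
}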
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,35 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC6](p, 18, points, scalars, 
splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) - - case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) - - case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
-
-	var runningSum, total g1JacExtended
-	runningSum.setInfinity()
-	total.setInfinity()
-	for k := len(buckets) - 1; k >= 0; k-- {
-		if !buckets[k].ZZ.IsZero() {
-			runningSum.add(&buckets[k])
-		}
-		total.add(&runningSum)
-	}
-
-	chRes <- total
-
-}
-
-func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac {
 	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%c != 0 {
 		nbChunks++
 	}
+
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
 	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
@@ -464,45 +193,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi
 		chChunks[i] = make(chan g1JacExtended, 1)
 	}
 
-	if (fr.Limbs*64)%c != 0 {
-		// TODO @gbotrel not always needed to do ext jac here.
-		go func(j uint64, points []G1Affine, scalars []fr.Element) {
-			// var buckets LB
-			// lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c))
-			// buckets := make([]g1JacExtended, 1<<(lastC-1))
-			// TODO @gbotrel last C restore.
-			msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars)
-		}(uint64(nbChunks-1), points, scalars)
-		nbChunks--
-	}
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
 
-	processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) {
-		msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
 	}
 
-	for j := int(nbChunks - 1); j > 0; j-- {
-		go processChunk(j, points, scalars, chChunks[j])
-	}
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+ if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +300,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +345,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
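The bucket reduction used by every processChunk variant, total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1], needs no scalar multiplications: scanning from the top, a suffix running sum counts bucket[k] exactly k+1 times. A minimal integer analogue, with ints standing in for g1JacExtended points (illustrative, not library code):

package main

import "fmt"

func main() {
	// buckets[k] conceptually holds the sum of all points whose digit is k+1.
	buckets := []int{5, 7, 11}
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k] // suffix sum: buckets[k] enters here once...
		total += runningSum      // ...and is then re-added on every lower k
	}
	fmt.Println(total, 1*5+2*7+3*11) // both print 52
}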
@@ -617,12 +355,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +369,76 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + default: + panic("not implemented") + } +} - case 17: - 
msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk)
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool,
+	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac {
 
-	case 18:
-		msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk)
+	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
 
-	case 19:
-		msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk)
+	// for each chunk, spawn one go routine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
+	// critical for performance
 
-	case 20:
-		msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk)
+	// each go routine sends its result in chChunks[i] channel
+	chChunks := make([]chan g2JacExtended, nbChunks)
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}
 
-	case 21:
-		msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk)
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
 
-	case 22:
-		msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk)
+	for j := int(nbChunks - 2); j > 0; j-- {
+		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+	}
 
-	case 23:
-		msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk)
+	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
+	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunks are processed
+	// in the ~same amount of time
+	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}
 
-	default:
-		panic("not implemented")
 	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }
 
 // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +457,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	return p.unsafeFromJacExtended(&_p)
 }
 
-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
 
-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}
 
-	// for each scalars, get the digit corresponding to the chunk we're processing.
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) } + selectors[chunk] = d } - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
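+	// (for instance: if 200k of 1M scalars fit in the low c bits, the ratio is
+	// 0.2 >= 0.1, so the caller sets splitFirstChunk and chunk 0 is processed
+	// by two go routines over half the points each)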
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
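+				// (worked example: with c = 4, a window value of 13 becomes the
+				// digit 13 - 16 = -3 with a carry of 1 into the next window; the
+				// negative digit is then encoded below as (-digit-1) | msbWindow,
+				// here 2 | msbWindow)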
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
 
-	if !splitFirstChunk {
-		go processChunk(0, points, scalars, chChunks[0])
-	} else {
-		chSplit := make(chan g2JacExtended, 2)
-		split := len(points) / 2
-		go processChunk(0, points[:split], scalars[:split], chSplit)
-		go processChunk(0, points[split:], scalars[split:], chSplit)
-		go func() {
-			s1 := <-chSplit
-			s2 := <-chSplit
-			close(chSplit)
-			s1.add(&s2)
-			chChunks[0] <- s1
-		}()
-	}
+				var bits uint64
+				if digit >= 0 {
+					bits = uint64(digit)
+				} else {
+					bits = uint64(-digit-1) | msbWindow
+				}
 
-	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
+				toReturn[i][s.index] |= (bits << s.shift)
+				if s.multiWordSelect {
+					toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				}
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
 }
diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go
index 3b533e9059..537221cb69 100644
--- a/ecc/bw6-756/multiexp_affine.go
+++ b/ecc/bw6-756/multiexp_affine.go
@@ -17,11 +17,7 @@ package bw6756
 
 import (
-	"errors"
-	"github.com/consensys/gnark-crypto/ecc"
 	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
-	"math"
-	"runtime"
 )
 
 const MAX_BATCH_SIZE = 600
 
@@ -34,210 +30,118 @@ func (o batchOp) isNeg() bool {
 	return o.pointID&1 == 1
 }
 
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
+// processChunkG1BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
 //
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) {
-	var _p G1Jac
-	if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil {
-		return nil, err
-	}
-	p.FromJacobian(&_p)
-	return p, nil
-}
-
-// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
-// This call return an error if len(scalars) != len(points) or if provided config is invalid.
-func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) {
-	// note:
-	// each of the batchAffineMsmCX method is the same, except for the c constant it declares
-	// duplicating (through template generation) these methods allows to declare the buckets on the stack
-	// the choice of c needs to be improved:
-	// there is a theoritical value that gives optimal asymptotics
-	// but in practice, other factors come into play, including:
-	// * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1
-	// * number of CPUs
-	// * cache friendliness (which depends on the host, G1 or G2... )
-	// --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't.
-
-	// for each batchAffineMsmCX
-	// step 1
-	// we compute, for each scalars over c-bit wide windows, nbChunk digits
-	// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
-	// 2^{c} to the current digit, making it negative.
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } + batch := newBatchG1Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh } - // empirical, needs to be tuned. 
- // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 + if bits == 0 { + continue } - } - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. 
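+		// (note: each pass re-attempts the deferred ops and keeps only those whose
+		// bucket is still pending; because the batch executes even when not full,
+		// every pass frees all buckets, so this drain loop terminates)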
} - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + // flush items in batch. + batch.ExecuteAndReset() - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1]
 
-	case 20:
-		batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk)
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].IsInfinity() {
+			runningSum.addMixed(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
 
-	case 21:
-		batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk)
+	chRes <- total
 
-	case 22:
-		batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk)
+}
 
-	case 23:
-		batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk)
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketG1AffineC4 [1 << (4 - 1)]G1Affine
+type bucketG1AffineC5 [1 << (5 - 1)]G1Affine
+type bucketG1AffineC8 [1 << (8 - 1)]G1Affine
+type bucketG1AffineC16 [1 << (16 - 1)]G1Affine
 
-	default:
-		panic("not implemented")
-	}
+type ibG1Affine interface {
+	bucketG1AffineC4 |
+		bucketG1AffineC5 |
+		bucketG1AffineC8 |
+		bucketG1AffineC16
 }
 
 type BatchG1Affine[B ibG1Affine] struct {
@@ -347,10 +251,16 @@ func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]
 
 }
 
-func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
-	chRes chan<- g1JacExtended,
+// processChunkG2BatchAffine processes a chunk of the scalars during the msm
+// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition
+// we use a batch affine addition.
+//
+// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249
+// See Section 5.3: ia.cr/2022/1396
+func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
+	chRes chan<- g2JacExtended,
 	c uint64,
-	points []G1Affine,
+	points []G2Affine,
 	scalars []fr.Element) {
 
 	mask := uint64((1 << c) - 1) // low c bits are 1
@@ -372,7 +282,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
 		s.shiftHigh = (c - nbBitsHigh)
 	}
 
-	batch := newBatchG1Affine(&buckets, points)
+	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
 	for i := 0; i < len(scalars); i++ {
@@ -417,7 +327,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
 	// chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points))
 	// batch.ExecuteAndReset()
 	for len(queue) != 0 {
-		queue = processQueueG1Affine(queue, &batch)
+		queue = processQueueG2Affine(queue, &batch)
 		batch.ExecuteAndReset() // execute batch even if not full.
 	}
 
@@ -427,7 +337,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64,
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] - var runningSum, total g1JacExtended + var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { @@ -441,366 +351,18 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 
[1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended - -type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | - bucketG1AffineC11 | - bucketG1AffineC12 | - bucketG1AffineC13 | - bucketG1AffineC14 | - bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | - bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... 
) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
-	_p := make([]G2Jac, nbSplits-1)
-	chDone := make(chan int, nbSplits-1)
-	for i := 0; i < nbSplits-1; i++ {
-		start := i * nbPoints
-		end := start + nbPoints
-		go func(start, end, i int) {
-			msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-			chDone <- i
-		}(start, end, i)
-	}
-
-	msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk)
-	for i := 0; i < nbSplits-1; i++ {
-		done := <-chDone
-		p.AddAssign(&_p[done])
-	}
-	close(chDone)
-	return p, nil
-}
-
-func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) {
-
-	switch c {
-
-	case 1:
-		msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk)
-
-	case 2:
-		msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk)
-
-	case 3:
-		msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk)
-
-	case 4:
-		msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk)
-
-	case 5:
-		msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk)
-
-	case 6:
-		msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk)
-
-	case 7:
-		msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk)
-
-	case 8:
-		msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk)
-
-	case 9:
-		msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk)
-
-	case 10:
-		batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk)
-
-	case 11:
-		batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk)
-
-	case 12:
-		batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk)
-
-	case 13:
-		batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk)
-
-	case 14:
-		batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk)
-
-	case 15:
-		batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk)
-
-	case 16:
-		batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk)
-
-	case 17:
-		batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk)
-
-	case 18:
-		batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk)
-
-	case 19:
-		batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk)
-
-	case 20:
-		batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk)
-
-	case 21:
-		batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk)
-
-	case 22:
-		batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk)
-
-	case 23:
-		batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk)
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketG2AffineC4 [1 << (4 - 1)]G2Affine
+type bucketG2AffineC5 [1 << (5 - 1)]G2Affine
+type bucketG2AffineC8 
[1 << (8 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine - default: - panic("not implemented") - } +type ibG2Affine interface { + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC8 | + bucketG2AffineC16 } type BatchG2Affine[B ibG2Affine] struct { @@ -909,253 +471,3 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] return queue } - -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type 
bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended - -type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | - bucketG2AffineC11 | - bucketG2AffineC12 | - bucketG2AffineC13 | - bucketG2AffineC14 | - bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | - bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 -} - -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 -} diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go new file mode 100644 index 0000000000..9dc8862130 --- /dev/null +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -0,0 +1,177 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
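The per-window bucket types and interface unions deleted above (one type per c from 1 to 23) shrink, in the generated files that follow, to the four window sizes the dispatcher still selects: 4, 5, 8 and 16. As a minimal, self-contained sketch of why the pattern is written this way (illustrative names, with int standing in for a curve point, not library code): a type-set union over fixed-size array types gives each generic instantiation a compile-time bucket count, so `var buckets B` is a plain array value that can live on the stack.

type bucketC4 [1 << (4 - 1)]int
type bucketC8 [1 << (8 - 1)]int

type ibBucket interface {
	bucketC4 | bucketC8
}

// accumulate is compiled once per bucket size; buckets is an array value,
// not a slice, so no heap allocation is needed for it.
func accumulate[B ibBucket](bucketIDs []int, values []int) (total int) {
	var buckets B
	for k, id := range bucketIDs {
		buckets[id%len(buckets)] += values[k]
	}
	for _, b := range buckets {
		total += b
	}
	return total
}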
+
+// Code generated by consensys/gnark-crypto DO NOT EDIT
+
+package bw6756
+
+import (
+	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
+)
+
+func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64,
+	chRes chan<- g1JacExtended,
+	c uint64,
+	points []G1Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+
+	var runningSum, total g1JacExtended
+	runningSum.setInfinity()
+	total.setInfinity()
+	for k := len(buckets) - 1; k >= 0; k-- {
+		if !buckets[k].ZZ.IsZero() {
+			runningSum.add(&buckets[k])
+		}
+		total.add(&runningSum)
+	}
+
+	chRes <- total
+
+}
+
+// we declare the buckets as fixed-size array types
+// this allows us to allocate the buckets on the stack
+type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended
+type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended
+type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended
+type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended
+
+type ibg1JacExtended interface {
+	bucketg1JacExtendedC4 |
+		bucketg1JacExtendedC5 |
+		bucketg1JacExtendedC8 |
+		bucketg1JacExtendedC16
+}
+
+func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64,
+	chRes chan<- g2JacExtended,
+	c uint64,
+	points []G2Affine,
+	scalars []fr.Element) {
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	msbWindow := uint64(1 << (c - 1))
+
+	var buckets B
+	for i := 0; i < len(buckets); i++ {
+		buckets[i].setInfinity()
+	}
+
+	jc := uint64(chunk * c)
+	s := selector{}
+	s.index = jc / 64
+	s.shift = jc - (s.index * 64)
+	s.mask = mask << s.shift
+	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
+	if s.multiWordSelect {
+		nbBitsHigh := s.shift - uint64(64-c)
+		s.maskHigh = (1 << nbBitsHigh) - 1
+		s.shiftHigh = (c - nbBitsHigh)
+	}
+
+	// for each scalar, get the digit corresponding to the chunk we're processing.
+	for i := 0; i < len(scalars); i++ {
+		bits := (scalars[i][s.index] & s.mask) >> s.shift
+		if s.multiWordSelect {
+			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
+		}
+
+		if bits == 0 {
+			continue
+		}
+
+		// if msbWindow bit is set, we need to subtract
+		if bits&msbWindow == 0 {
+			// add
+			buckets[bits-1].addMixed(&points[i])
+		} else {
+			// sub
+			buckets[bits & ^msbWindow].subMixed(&points[i])
+		}
+	}
+
+	// reduce buckets into total
+	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC16 +} diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 6cbf26cdfa..0d0384701c 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 8, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { 
b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index cfaef03004..d9da35c23d 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -25,143 +25,6 @@ import ( "runtime" ) -// selector stores the index, mask and shifts needed to select bits from a scalar -// it is used during the multiExp algorithm or the batch scalar multiplication -type selector struct { - index uint64 // index in the multi-word scalar to select bits from - mask uint64 // mask (c-bit wide) - shift uint64 // shift needed to get our bits on low positions - - multiWordSelect bool // set to true if we need to select bits from 2 words (case where c doesn't divide 64) - maskHigh uint64 // same than mask, for index+1 - shiftHigh uint64 // 
same than shift, for index+1 -} - -// partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits -// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract -// 2^{c} to the current digit, making it negative. -// negative digits can be processed in a later step as adding -G into the bucket instead of G -// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) -// scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { - toReturn := make([]fr.Element, len(scalars)) - - // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words - - // compute offset and word selector / shift to select the right bits of our windows - selectors := make([]selector, nbChunks) - for chunk := uint64(0); chunk < nbChunks; chunk++ { - jc := uint64(chunk * c) - d := selector{} - d.index = jc / 64 - d.shift = jc - (d.index * 64) - d.mask = mask << d.shift - d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) - if d.multiWordSelect { - nbBitsHigh := d.shift - uint64(64-c) - d.maskHigh = (1 << nbBitsHigh) - 1 - d.shiftHigh = (c - nbBitsHigh) - } - selectors[chunk] = d - } - - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
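Reading aid for the selector fields above: a hedged, standalone sketch of the digit extraction they encode, over a 4-limb little-endian scalar. windowDigit is a hypothetical helper, not library API; it mirrors the index/shift/mask path and the maskHigh/shiftHigh path taken when a window straddles two 64-bit words (only possible when c does not divide 64).

// windowDigit returns the chunk-th c-bit window of a 4-limb scalar, for c < 64.
func windowDigit(limbs [4]uint64, c, chunk uint64) uint64 {
	j := chunk * c
	index := j / 64
	shift := j - index*64
	bits := (limbs[index] >> shift) & (uint64(1)<<c - 1)
	// when c does not divide 64, a window may straddle two words:
	// fetch the missing high bits from the next limb.
	if (64%c) != 0 && shift > 64-c && index < 3 {
		nbBitsHigh := shift - (64 - c)
		bits += (limbs[index+1] & (uint64(1)<<nbBitsHigh - 1)) << (c - nbBitsHigh)
	}
	return bits
}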
- chSmallValues := make(chan int, nbTasks) - - parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 - for i := start; i < end; i++ { - var carry int - - scalar := scalars[i] - if scalarsMont { - scalar.FromMont() - } - if scalar.FitsOnOneWord() { - // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } - } - - // for each chunk in the scalar, compute the current digit, and an eventual carry - for chunk := uint64(0); chunk < nbChunks; chunk++ { - s := selectors[chunk] - - // init with carry if any - digit := carry - carry = 0 - - // digit = value of the c-bit window - digit += int((scalar[s.index] & s.mask) >> s.shift) - - if s.multiWordSelect { - // we are selecting bits over 2 words - digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh - } - - // if digit is zero, no impact on result - if digit == 0 { - continue - } - - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - if digit >= max { - digit -= (1 << c) - carry = 1 - } - - var bits uint64 - if digit >= 0 { - bits = uint64(digit) - } else { - bits = uint64(-digit-1) | msbWindow - } - - toReturn[i][s.index] |= (bits << s.shift) - if s.multiWordSelect { - toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - } - - } - } - - chSmallValues <- smallValues - - }, nbTasks) - - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o - } - return toReturn, smallValues -} - // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -221,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -266,7 +129,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
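The surrounding hunks rewire a simple fan-out/fan-in: the point set is cut into nbSplits slices, each partial MSM runs in its own goroutine, and the partial results are summed as they complete. A sketch of the shape with integers standing in for G1 results (process plays the role of innerMsmG1; the real code also reuses p for the last split rather than spawning a goroutine for it):

func fanIn(data []int, nbSplits int, process func([]int) int) int {
	per := len(data) / nbSplits
	chDone := make(chan int, nbSplits)
	for i := 0; i < nbSplits; i++ {
		start, end := i*per, (i+1)*per
		if i == nbSplits-1 {
			end = len(data) // the last split absorbs the remainder
		}
		go func(part []int) { chDone <- process(part) }(data[start:end])
	}
	total := 0
	for i := 0; i < nbSplits; i++ {
		total += <-chDone // completion order is irrelevant: addition commutes
	}
	return total
}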
@@ -276,12 +139,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG1Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG1Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -290,169 +153,35 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG1Jac(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG1Affine[bucketg1JacExtendedC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG1Affine[bucketg1JacExtendedC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG1Affine[bucketg1JacExtendedC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG1Affine[bucketg1JacExtendedC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG1Affine[bucketg1JacExtendedC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG1Affine[bucketg1JacExtendedC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] + _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - msmCG1Affine[bucketg1JacExtendedC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - msmCG1Affine[bucketg1JacExtendedC18, bucketg1JacExtendedC6](p, 18, points, scalars, 
splitFirstChunk) - - case 19: - msmCG1Affine[bucketg1JacExtendedC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) - - case 20: - msmCG1Affine[bucketg1JacExtendedC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) - - case 21: - msmCG1Affine[bucketg1JacExtendedC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) - - case 22: - msmCG1Affine[bucketg1JacExtendedC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) - - case 23: - msmCG1Affine[bucketg1JacExtendedC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) - + processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { - var _p g1JacExtended - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - -func msmProcessChunkG1Affine[B ibg1JacExtended](chunk uint64, - chRes chan<- g1JacExtended, - c uint64, - points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g1JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window // note that buckets is an array allocated on the stack (for most sizes of c) and this is @@ -464,45 +193,54 @@ func msmCG1Affine[B ibg1JacExtended, LB ibg1JacExtended](p *G1Jac, c uint64, poi chChunks[i] = make(chan g1JacExtended, 1) } - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG1Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1Affine[B](uint64(j), chChunk, c, points, scalars) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
+ if !splitFirstChunk { + go processChunk(0, chChunks[0], c, points, scalars) + } else { + chSplit := make(chan g1JacExtended, 2) + split := len(points) / 2 + go processChunk(0, chSplit, c, points[:split], scalars[:split]) + go processChunk(0, chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { + var _p g1JacExtended + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) + } + + return p.unsafeFromJacExtended(&_p) +} + // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf // // This call return an error if len(scalars) != len(points) or if provided config is invalid. @@ -562,7 +300,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} + implementedCs := []uint64{4, 5, 8, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -607,7 +345,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2Jac , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 // we have nbSplits intermediate results that we must sum together. 
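Two reductions cooperate in the code above. Within a chunk, the running-sum pass turns the buckets into the weighted sum bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] using only two group additions per bucket; across chunks, msmReduceChunkG1Affine folds the per-chunk sums MSB-first with c doublings between steps. An integer analogue of both, as a sketch rather than library code:

func reduceBuckets(buckets []uint64) uint64 {
	var runningSum, total uint64
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k] // runningSum = buckets[k] + ... + buckets[n-1]
		total += runningSum
	}
	return total // = 1*buckets[0] + 2*buckets[1] + ... + n*buckets[n-1]
}

func reduceChunks(chunkSums []uint64, c int) uint64 {
	acc := chunkSums[len(chunkSums)-1]
	for j := len(chunkSums) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			acc += acc // plays the role of _p.double(&_p)
		}
		acc += chunkSums[j]
	}
	return acc // = sum over j of chunkSums[j] * 2^(c*j)
}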
@@ -617,12 +355,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul start := i * nbPoints end := start + nbPoints go func(start, end, i int) { - msmInnerG2Jac(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - msmInnerG2Jac(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) + innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) for i := 0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) @@ -631,82 +369,76 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func msmInnerG2Jac(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { switch c { - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - msmCG2Affine[bucketg2JacExtendedC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - msmCG2Affine[bucketg2JacExtendedC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - msmCG2Affine[bucketg2JacExtendedC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - msmCG2Affine[bucketg2JacExtendedC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - msmCG2Affine[bucketg2JacExtendedC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - msmCG2Affine[bucketg2JacExtendedC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] + _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) case 16: - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + default: + panic("not implemented") + } +} - case 17: - 
msmCG2Affine[bucketg2JacExtendedC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk) +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { - case 18: - msmCG2Affine[bucketg2JacExtendedC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } - case 19: - msmCG2Affine[bucketg2JacExtendedC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // critical for performance - case 20: - msmCG2Affine[bucketg2JacExtendedC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } - case 21: - msmCG2Affine[bucketg2JacExtendedC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) - case 22: - msmCG2Affine[bucketg2JacExtendedC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) + for j := int(nbChunks - 2); j > 0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - case 23: - msmCG2Affine[bucketg2JacExtendedC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
+		if !splitFirstChunk {
+			go processChunk(0, chChunks[0], c, points, scalars)
+		} else {
+			chSplit := make(chan g2JacExtended, 2)
+			split := len(points) / 2
+			go processChunk(0, chSplit, c, points[:split], scalars[:split])
+			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			go func() {
+				s1 := <-chSplit
+				s2 := <-chSplit
+				close(chSplit)
+				s1.add(&s2)
+				chChunks[0] <- s1
+			}()
+		}
 
-	default:
-		panic("not implemented")
 	}
+
+	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }
 
 // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp
@@ -725,121 +457,139 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	return p.unsafeFromJacExtended(&_p)
 }
 
-func msmProcessChunkG2Affine[B ibg2JacExtended](chunk uint64,
-	chRes chan<- g2JacExtended,
-	c uint64,
-	points []G2Affine,
-	scalars []fr.Element) {
+// selector stores the index, mask and shifts needed to select bits from a scalar
+// it is used during the multiExp algorithm or the batch scalar multiplication
+type selector struct {
+	index uint64 // index in the multi-word scalar to select bits from
+	mask  uint64 // mask (c-bit wide)
+	shift uint64 // shift needed to get our bits on low positions
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
+	multiWordSelect bool   // set to true if we need to select bits from 2 words (case where c doesn't divide 64)
+	maskHigh        uint64 // same as mask, for index+1
+	shiftHigh       uint64 // same as shift, for index+1
+}
 
-	var buckets B
-	for i := 0; i < len(buckets); i++ {
-		buckets[i].setInfinity()
-	}
+// partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
+// if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, the number of scalars that meet the following condition:
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+	toReturn := make([]fr.Element, len(scalars))
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
 	}
 
-	// for each scalars, get the digit corresponding to the chunk we're processing.
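A hedged sketch of the recoding that the partitionScalars comment above describes, applied to a single uint64 instead of an fr.Element (and ignoring a carry out of the topmost window, which the real code absorbs in the extra chunk). Digits land in [-2^{c-1}, 2^{c-1}-1], which is why 2^{c-1} buckets suffice once negative digits are handled by adding -G:

// signedDigits recodes scalar into base-2^c signed digits.
func signedDigits(scalar uint64, c uint) []int {
	digits := make([]int, 0, 64/c+1)
	carry := 0
	for shift := uint(0); shift < 64; shift += c {
		d := carry + int((scalar>>shift)&(1<<c-1))
		carry = 0
		if d >= 1<<(c-1) {
			d -= 1 << c // borrow 2^c from the next window...
			carry = 1   // ...and make the current digit negative
		}
		digits = append(digits, d)
	}
	return digits // scalar == sum of digits[j] * 2^(c*j), given no final carry
}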
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
-
-		if bits == 0 {
-			continue
-		}
+	mask := uint64((1 << c) - 1) // low c bits are 1
-		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
-			// add
-			buckets[bits-1].addMixed(&points[i])
-		} else {
-			// sub
-			buckets[bits & ^msbWindow].subMixed(&points[i])
+	msbWindow := uint64(1 << (c - 1)) // msb of the c-bit window
+	max := int(1 << (c - 1))          // max value we want for our digits
+	cDivides64 := (64 % c) == 0       // if c doesn't divide 64, we may need to select over multiple words
-		}
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
 		}
+		selectors[chunk] = d
 	}
 
-	// reduce buckets into total
-	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
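The deadlock warning just above is a channel-sizing argument: every worker sends exactly one count, and nothing is drained until all workers have finished, so the buffer must hold one slot per worker. A standalone sketch of the same aggregation, with sync.WaitGroup standing in for parallel.Execute's completion:

package sketch

import "sync"

func countParallel(n, nbWorkers int, pred func(int) bool) int {
	ch := make(chan int, nbWorkers) // one slot per worker, or wg.Wait() below never returns
	var wg sync.WaitGroup
	per := (n + nbWorkers - 1) / nbWorkers
	for w := 0; w < nbWorkers; w++ {
		start, end := w*per, (w+1)*per
		if end > n {
			end = n
		}
		wg.Add(1)
		go func(start, end int) {
			defer wg.Done()
			count := 0
			for i := start; i < end; i++ {
				if pred(i) {
					count++
				}
			}
			ch <- count // buffered: never blocks, so Done always runs
		}(start, end)
	}
	wg.Wait() // all sends are in the buffer at this point
	close(ch)
	total := 0
	for c := range ch {
		total += c
	}
	return total
}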
+ chSmallValues := make(chan int, nbTasks) - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int - chRes <- total + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } -} + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] -func msmCG2Affine[B ibg2JacExtended, LB ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // init with carry if any + digit := carry + carry = 0 - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunkG2Affine[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2Affine[B](uint64(j), chChunk, c, points, scalars) - } + // if digit is zero, no impact on result + if digit == 0 { + continue + } - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } + // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract + // 2^{c} to the current digit, making it negative. 
+ if digit >= max { + digit -= (1 << c) + carry = 1 + } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + var bits uint64 + if digit >= 0 { + bits = uint64(digit) + } else { + bits = uint64(-digit-1) | msbWindow + } - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) + toReturn[i][s.index] |= (bits << s.shift) + if s.multiWordSelect { + toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 72d199f31f..f012110b80 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -17,11 +17,7 @@ package bw6761 import ( - "errors" - "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -34,210 +30,118 @@ func (o batchOp) isNeg() bool { return o.pointID&1 == 1 } -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf +// processChunkG1BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. // -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Affine) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Affine, error) { - var _p G1Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G1Jac) MultiExpBatchAffine(points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G1Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
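Two bit-packing conventions meet in this region: partitionScalars stores a negative digit d as (-d-1) | msbWindow, and batchOp (above) keeps the point index in the upper bits of pointID with the subtraction flag in bit 0, which is all isNeg() tests. A sketch of the digit round-trip (encodeDigit and decodeDigit are illustrative helpers, not library functions):

// encodeDigit packs a signed digit into c bits: non-negative digits verbatim,
// a negative digit d as (-d-1) with the window's msb set.
func encodeDigit(d int, c uint) uint64 {
	msbWindow := uint64(1) << (c - 1)
	if d >= 0 {
		return uint64(d)
	}
	return uint64(-d-1) | msbWindow
}

// decodeDigit recovers the bucket index and sign; it assumes bits != 0,
// since zero digits are skipped before any bucket update.
func decodeDigit(bits uint64, c uint) (bucketID uint64, neg bool) {
	msbWindow := uint64(1) << (c - 1)
	if bits&msbWindow == 0 {
		return bits - 1, false // digit d: add the point to bucket d-1
	}
	return bits &^ msbWindow, true // digit d < 0: subtract it from bucket |d|-1
}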
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } + batch := newBatchG1Affine(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh } - // empirical, needs to be tuned. 
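The bestC closure being removed here scores every implemented window size c with cost of roughly bits/c * (nbPoints + 2^c): a scalar yields bits/c chunks, and each chunk costs one bucket update per point plus on the order of 2^c operations to build and reduce its buckets. A standalone sketch of the heuristic (bits would be fr.Limbs*64; as the deleted comment notes, the model still needs empirical tuning):

package sketch

import "math"

// bestC picks the window size minimizing (bits/c)*(nbPoints + 2^c) over the
// window sizes a build actually implements.
func bestC(nbPoints, bits int, implementedCs []uint64) uint64 {
	best := implementedCs[0]
	minCost := math.MaxFloat64
	for _, c := range implementedCs {
		cost := float64(bits*(nbPoints+(1<<c))) / float64(c)
		if cost < minCost {
			minCost, best = cost, c
		}
	}
	return best
}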
- // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 + if bits == 0 { + continue } - } - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG1JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG1JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) + } + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) + } } - - msmInnerG1JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueueG1Affine(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. 
} - close(chDone) - return p, nil -} - -func msmInnerG1JacBatchAffine(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG1Affine[bucketg1JacExtendedC1, bucketg1JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG1Affine[bucketg1JacExtendedC2, bucketg1JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG1Affine[bucketg1JacExtendedC3, bucketg1JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG1Affine[bucketg1JacExtendedC4, bucketg1JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG1Affine[bucketg1JacExtendedC5, bucketg1JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG1Affine[bucketg1JacExtendedC6, bucketg1JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG1Affine[bucketg1JacExtendedC7, bucketg1JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG1Affine[bucketg1JacExtendedC8, bucketg1JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - case 9: - msmCG1Affine[bucketg1JacExtendedC9, bucketg1JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG1AffineMsm[bucketG1AffineC10, bucketg1JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG1AffineMsm[bucketG1AffineC11, bucketg1JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG1AffineMsm[bucketG1AffineC12, bucketg1JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG1AffineMsm[bucketG1AffineC13, bucketg1JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG1AffineMsm[bucketG1AffineC14, bucketg1JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG1AffineMsm[bucketG1AffineC15, bucketg1JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG1AffineMsm[bucketG1AffineC16, bucketg1JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG1AffineMsm[bucketG1AffineC17, bucketg1JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG1AffineMsm[bucketG1AffineC18, bucketg1JacExtendedC6](p, 18, points, scalars, splitFirstChunk) + // flush items in batch. + batch.ExecuteAndReset() - case 19: - batchG1AffineMsm[bucketG1AffineC19, bucketg1JacExtendedC4](p, 19, points, scalars, splitFirstChunk) + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - case 20: - batchG1AffineMsm[bucketG1AffineC20, bucketg1JacExtendedC4](p, 20, points, scalars, splitFirstChunk) + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) + } - case 21: - batchG1AffineMsm[bucketG1AffineC21, bucketg1JacExtendedC6](p, 21, points, scalars, splitFirstChunk) + chRes <- total - case 22: - batchG1AffineMsm[bucketG1AffineC22, bucketg1JacExtendedC10](p, 22, points, scalars, splitFirstChunk) +} - case 23: - batchG1AffineMsm[bucketG1AffineC23, bucketg1JacExtendedC16](p, 23, points, scalars, splitFirstChunk) +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketG1AffineC4 [1 << (4 - 1)]G1Affine +type bucketG1AffineC5 [1 << (5 - 1)]G1Affine +type bucketG1AffineC8 [1 << (8 - 1)]G1Affine +type bucketG1AffineC16 [1 << (16 - 1)]G1Affine - default: - panic("not implemented") - } +type ibG1Affine interface { + bucketG1AffineC4 | + bucketG1AffineC5 | + bucketG1AffineC8 | + bucketG1AffineC16 } type BatchG1Affine[B ibG1Affine] struct { @@ -347,10 +251,16 @@ func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B] } -func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, - chRes chan<- g1JacExtended, +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. +// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, c uint64, - points []G1Affine, + points []G2Affine, scalars []fr.Element) { mask := uint64((1 << c) - 1) // low c bits are 1 @@ -372,7 +282,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, s.shiftHigh = (c - nbBitsHigh) } - batch := newBatchG1Affine(&buckets, points) + batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 for i := 0; i < len(scalars); i++ { @@ -417,7 +327,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) // batch.ExecuteAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) + queue = processQueueG2Affine(queue, &batch) batch.ExecuteAndReset() // execute batch even if not full. } @@ -427,7 +337,7 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - var runningSum, total g1JacExtended + var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { @@ -441,366 +351,18 @@ func msmProcessChunkG1AffineBatchAffine[B ibG1Affine](chunk uint64, } -func batchG1AffineMsm[B ibG1Affine, J ibg1JacExtended](p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) *G1Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G1Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g1JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG1Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G1Affine, scalars []fr.Element, chChunk chan g1JacExtended) { - msmProcessChunkG1AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - -type bucketG1AffineC1 [1 << (1 - 1)]G1Affine -type bucketG1AffineC2 [1 << (2 - 1)]G1Affine -type bucketG1AffineC3 [1 << (3 - 1)]G1Affine -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC17 [1 << (17 - 1)]G1Affine -type bucketG1AffineC18 [1 << (18 - 1)]G1Affine -type bucketG1AffineC19 [1 << (19 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine -type bucketG1AffineC22 [1 << (22 - 1)]G1Affine -type bucketG1AffineC23 [1 << (23 - 1)]G1Affine -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 
[1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC17 [1 << (17 - 1)]g1JacExtended -type bucketg1JacExtendedC18 [1 << (18 - 1)]g1JacExtended -type bucketg1JacExtendedC19 [1 << (19 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended -type bucketg1JacExtendedC22 [1 << (22 - 1)]g1JacExtended -type bucketg1JacExtendedC23 [1 << (23 - 1)]g1JacExtended - -type ibG1Affine interface { - bucketG1AffineC1 | - bucketG1AffineC2 | - bucketG1AffineC3 | - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | - bucketG1AffineC11 | - bucketG1AffineC12 | - bucketG1AffineC13 | - bucketG1AffineC14 | - bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC17 | - bucketG1AffineC18 | - bucketG1AffineC19 | - bucketG1AffineC20 | - bucketG1AffineC21 | - bucketG1AffineC22 | - bucketG1AffineC23 -} - -type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | - bucketg1JacExtendedC3 | - bucketg1JacExtendedC4 | - bucketg1JacExtendedC5 | - bucketg1JacExtendedC6 | - bucketg1JacExtendedC7 | - bucketg1JacExtendedC8 | - bucketg1JacExtendedC9 | - bucketg1JacExtendedC10 | - bucketg1JacExtendedC11 | - bucketg1JacExtendedC12 | - bucketg1JacExtendedC13 | - bucketg1JacExtendedC14 | - bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC17 | - bucketg1JacExtendedC18 | - bucketg1JacExtendedC19 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 | - bucketg1JacExtendedC22 | - bucketg1JacExtendedC23 -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Affine) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Affine, error) { - var _p G2Jac - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} - -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *G2Jac) MultiExpBatchAffine(points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) (*G2Jac, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... 
) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. - // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") - } - - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") - } - - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23} - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } - } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } - } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInnerG2JacBatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. 
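The signed-digit recoding described in step 1 above is easiest to see on a toy scalar. The sketch below is a hypothetical single-word version of what partitionScalars does limb by limb over fr.Element: any c-bit window strictly larger than 2^{c-1} borrows 2^c from the next window, so every digit lands in [-2^{c-1}, 2^{c-1}] and 2^{c-1} buckets suffice (computing -G for a negative digit is cheap).

package main

import "fmt"

// signedDigits recodes s into c-bit signed digits d[i] with
// sum(d[i] * 2^(c*i)) == s and |d[i]| <= 2^(c-1).
// Toy single-word version; the real code performs the same carry
// propagation across the scalar's 64-bit limbs.
func signedDigits(s uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	var digits []int64
	carry := uint64(0)
	for s != 0 || carry != 0 {
		v := (s & mask) + carry
		s >>= c
		if v > 1<<(c-1) {
			// borrow 2^c from the next window and make this digit negative
			digits = append(digits, int64(v)-(int64(1)<<c))
			carry = 1
		} else {
			digits = append(digits, int64(v))
			carry = 0
		}
	}
	return digits
}

func main() {
	// 13 = 0b1101 with c = 3: the low window is 5 > 4, so it becomes
	// 5-8 = -3 with a carry into the next window: digits (-3, 2),
	// and indeed -3 + 2*8 = 13.
	fmt.Println(signedDigits(13, 3)) // [-3 2]
}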
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInnerG2JacBatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - - msmInnerG2JacBatchAffine(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) - return p, nil -} - -func msmInnerG2JacBatchAffine(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - - case 1: - msmCG2Affine[bucketg2JacExtendedC1, bucketg2JacExtendedC1](p, 1, points, scalars, splitFirstChunk) - - case 2: - msmCG2Affine[bucketg2JacExtendedC2, bucketg2JacExtendedC2](p, 2, points, scalars, splitFirstChunk) - - case 3: - msmCG2Affine[bucketg2JacExtendedC3, bucketg2JacExtendedC3](p, 3, points, scalars, splitFirstChunk) - - case 4: - msmCG2Affine[bucketg2JacExtendedC4, bucketg2JacExtendedC4](p, 4, points, scalars, splitFirstChunk) - - case 5: - msmCG2Affine[bucketg2JacExtendedC5, bucketg2JacExtendedC4](p, 5, points, scalars, splitFirstChunk) - - case 6: - msmCG2Affine[bucketg2JacExtendedC6, bucketg2JacExtendedC6](p, 6, points, scalars, splitFirstChunk) - - case 7: - msmCG2Affine[bucketg2JacExtendedC7, bucketg2JacExtendedC6](p, 7, points, scalars, splitFirstChunk) - - case 8: - msmCG2Affine[bucketg2JacExtendedC8, bucketg2JacExtendedC8](p, 8, points, scalars, splitFirstChunk) - - case 9: - msmCG2Affine[bucketg2JacExtendedC9, bucketg2JacExtendedC6](p, 9, points, scalars, splitFirstChunk) - - case 10: - batchG2AffineMsm[bucketG2AffineC10, bucketg2JacExtendedC4](p, 10, points, scalars, splitFirstChunk) - - case 11: - batchG2AffineMsm[bucketG2AffineC11, bucketg2JacExtendedC10](p, 11, points, scalars, splitFirstChunk) - - case 12: - batchG2AffineMsm[bucketG2AffineC12, bucketg2JacExtendedC12](p, 12, points, scalars, splitFirstChunk) - - case 13: - batchG2AffineMsm[bucketG2AffineC13, bucketg2JacExtendedC7](p, 13, points, scalars, splitFirstChunk) - - case 14: - batchG2AffineMsm[bucketG2AffineC14, bucketg2JacExtendedC6](p, 14, points, scalars, splitFirstChunk) - - case 15: - batchG2AffineMsm[bucketG2AffineC15, bucketg2JacExtendedC9](p, 15, points, scalars, splitFirstChunk) - - case 16: - batchG2AffineMsm[bucketG2AffineC16, bucketg2JacExtendedC16](p, 16, points, scalars, splitFirstChunk) - - case 17: - batchG2AffineMsm[bucketG2AffineC17, bucketg2JacExtendedC10](p, 17, points, scalars, splitFirstChunk) - - case 18: - batchG2AffineMsm[bucketG2AffineC18, bucketg2JacExtendedC6](p, 18, points, scalars, splitFirstChunk) - - case 19: - batchG2AffineMsm[bucketG2AffineC19, bucketg2JacExtendedC4](p, 19, points, scalars, splitFirstChunk) - - case 20: - batchG2AffineMsm[bucketG2AffineC20, bucketg2JacExtendedC4](p, 20, points, scalars, splitFirstChunk) - - case 21: - batchG2AffineMsm[bucketG2AffineC21, bucketg2JacExtendedC6](p, 21, points, scalars, splitFirstChunk) - - case 22: - batchG2AffineMsm[bucketG2AffineC22, bucketg2JacExtendedC10](p, 22, points, scalars, splitFirstChunk) - - case 23: - batchG2AffineMsm[bucketG2AffineC23, bucketg2JacExtendedC16](p, 23, points, scalars, splitFirstChunk) +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketG2AffineC4 [1 << (4 - 1)]G2Affine +type bucketG2AffineC5 [1 << (5 - 1)]G2Affine +type bucketG2AffineC8 
[1 << (8 - 1)]G2Affine +type bucketG2AffineC16 [1 << (16 - 1)]G2Affine - default: - panic("not implemented") - } +type ibG2Affine interface { + bucketG2AffineC4 | + bucketG2AffineC5 | + bucketG2AffineC8 | + bucketG2AffineC16 } type BatchG2Affine[B ibG2Affine] struct { @@ -909,253 +471,3 @@ func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B] return queue } - -func msmProcessChunkG2AffineBatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. - queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] - - var runningSum, total g2JacExtended - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - -func batchG2AffineMsm[B ibG2Affine, J ibg2JacExtended](p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) *G2Jac { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []G2Affine, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]g2JacExtended, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. - msmProcessChunkG2Affine[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks-1), points, scalars) - nbChunks-- - } - - processChunk := func(j int, points []G2Affine, scalars []fr.Element, chChunk chan g2JacExtended) { - msmProcessChunkG2AffineBatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j > 0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - -type bucketG2AffineC1 [1 << (1 - 1)]G2Affine -type bucketG2AffineC2 [1 << (2 - 1)]G2Affine -type bucketG2AffineC3 [1 << (3 - 1)]G2Affine -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC17 [1 << (17 - 1)]G2Affine -type bucketG2AffineC18 [1 << (18 - 1)]G2Affine -type bucketG2AffineC19 [1 << (19 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine -type bucketG2AffineC22 [1 << (22 - 1)]G2Affine -type bucketG2AffineC23 [1 << (23 - 1)]G2Affine -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type 
bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC17 [1 << (17 - 1)]g2JacExtended -type bucketg2JacExtendedC18 [1 << (18 - 1)]g2JacExtended -type bucketg2JacExtendedC19 [1 << (19 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended -type bucketg2JacExtendedC22 [1 << (22 - 1)]g2JacExtended -type bucketg2JacExtendedC23 [1 << (23 - 1)]g2JacExtended - -type ibG2Affine interface { - bucketG2AffineC1 | - bucketG2AffineC2 | - bucketG2AffineC3 | - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | - bucketG2AffineC11 | - bucketG2AffineC12 | - bucketG2AffineC13 | - bucketG2AffineC14 | - bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC17 | - bucketG2AffineC18 | - bucketG2AffineC19 | - bucketG2AffineC20 | - bucketG2AffineC21 | - bucketG2AffineC22 | - bucketG2AffineC23 -} - -type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | - bucketg2JacExtendedC3 | - bucketg2JacExtendedC4 | - bucketg2JacExtendedC5 | - bucketg2JacExtendedC6 | - bucketg2JacExtendedC7 | - bucketg2JacExtendedC8 | - bucketg2JacExtendedC9 | - bucketg2JacExtendedC10 | - bucketg2JacExtendedC11 | - bucketg2JacExtendedC12 | - bucketg2JacExtendedC13 | - bucketg2JacExtendedC14 | - bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC17 | - bucketg2JacExtendedC18 | - bucketg2JacExtendedC19 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 | - bucketg2JacExtendedC22 | - bucketg2JacExtendedC23 -} diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go new file mode 100644 index 0000000000..376ee3df28 --- /dev/null +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -0,0 +1,177 @@ +// Copyright 2020 ConsenSys Software Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by consensys/gnark-crypto DO NOT EDIT + +package bw6761 + +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" +) + +func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, + chRes chan<- g1JacExtended, + c uint64, + points []G1Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total g1JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended +type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended +type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended + +type ibg1JacExtended interface { + bucketg1JacExtendedC4 | + bucketg1JacExtendedC5 | + bucketg1JacExtendedC8 | + bucketg1JacExtendedC16 +} + +func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + scalars []fr.Element) { + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + // for each scalars, get the digit corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total g2JacExtended + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended +type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended +type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended + +type ibg2JacExtended interface { + bucketg2JacExtendedC4 | + bucketg2JacExtendedC5 | + bucketg2JacExtendedC8 | + bucketg2JacExtendedC16 +} diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 8d851d2d42..fa82870fa4 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG1Affine[bucketg1JacExtendedC16, bucketg1JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21} + cRange := []uint64{4, 5, 8, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -130,10 +130,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -171,10 +171,10 @@ func TestMultiExpG1(t *testing.T) { results := make([]G1Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG1Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -209,8 +209,8 @@ func TestMultiExpG1(t *testing.T) { var result1, result2 G1Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG1Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG1JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -288,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { 
b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmCG2Affine[bucketg2JacExtendedC16, bucketg2JacExtendedC16](&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -461,10 +461,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePoints[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -502,10 +502,10 @@ func TestMultiExpG2(t *testing.T) { results := make([]G2Jac, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInnerG2Jac(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) + innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -540,8 +540,8 @@ func TestMultiExpG2(t *testing.T) { var result1, result2 G2Jac for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInnerG2Jac(&result1, int(c), samplePoints[:], scalars, false) - msmInnerG2JacBatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) + innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -619,7 +619,7 @@ func BenchmarkMultiExpG2(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) } diff --git a/internal/generator/config/curve.go b/internal/generator/config/curve.go index 0d387a7cf2..e1df940957 100644 --- a/internal/generator/config/curve.go +++ b/internal/generator/config/curve.go @@ -51,16 +51,16 @@ func (c Curve) Equal(other Curve) bool { } type Point struct { - CoordType string - CoordExtDegree uint8 // value n, such that q = pⁿ - CoordExtRoot int64 // value a, such that the field is Fp[X]/(Xⁿ - a) - PointName string - GLV bool // scalar multiplication using GLV - CofactorCleaning bool // flag telling if the Cofactor cleaning is available - CRange []int // multiexp bucket method: generate inner methods (with const arrays) for each c - Projective bool // generate projective coordinates - A []string //A 
linear coefficient in Weierstrass form - B []string //B constant term in Weierstrass form + CoordType string + CoordExtDegree uint8 // value n, such that q = pⁿ + CoordExtRoot int64 // value a, such that the field is Fp[X]/(Xⁿ - a) + PointName string + GLV bool // scalar multiplication using GLV + CofactorCleaning bool // flag telling if the Cofactor cleaning is available + CRange, LastCRange []int // multiexp bucket method: generate inner methods (with const arrays) for each c + Projective bool // generate projective coordinates + A []string //A linear coefficient in Weierstrass form + B []string //B constant term in Weierstrass form } var Curves []Curve diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 6af6b7d54a..a4b3e9b5fd 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -17,6 +17,7 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er entries := []bavard.Entry{ {File: filepath.Join(baseDir, "multiexp.go"), Templates: []string{"multiexp.go.tmpl"}}, {File: filepath.Join(baseDir, "multiexp_affine.go"), Templates: []string{"multiexp_affine.go.tmpl"}}, + {File: filepath.Join(baseDir, "multiexp_jacobian.go"), Templates: []string{"multiexp_jacobian.go.tmpl"}}, {File: filepath.Join(baseDir, "multiexp_test.go"), Templates: []string{"tests/multiexp.go.tmpl"}}, {File: filepath.Join(baseDir, "marshal.go"), Templates: []string{"marshal.go.tmpl"}}, {File: filepath.Join(baseDir, "marshal_test.go"), Templates: []string{"tests/marshal.go.tmpl"}}, @@ -26,7 +27,7 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er funcs["last"] = func(x int, a interface{}) bool { return x == reflect.ValueOf(a).Len()-1 } - funcs["lastC"] = func(c int) int { + lastC := func(c int) int { // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) // if c divides fr.Limbs * 64; n := (conf.Fr.NbWords * 64) @@ -35,6 +36,8 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er } return n - (c * (n / c)) } + funcs["lastC"] = lastC + funcs["contains"] = func(v int, s []int) bool { for _, sv := range s { if v == sv { @@ -43,12 +46,17 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er } return false } - // TODO @gbotrel fix me. need to generate usual C, and missing lastC for bucket size. 
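The lastC helper added above computes the width of the top window: when c does not divide the scalar bit size conf.Fr.NbWords*64, the final chunk only covers the leftover bits, so its bucket array can be much smaller. A standalone sketch (the explicit nbWords parameter is for illustration; the generator reads it from conf.Fr):

// lastC returns the bit width of the highest window when splitting an
// (nbWords*64)-bit scalar into c-bit windows; it equals c when c divides
// the bit size, and the remainder otherwise.
func lastC(nbWords, c int) int {
	n := nbWords * 64
	if n%c == 0 {
		return c
	}
	return n - c*(n/c) // i.e. n % c
}

// e.g. a 6-limb fr (384 bits) with c = 13 gives 384 = 29*13 + 7, so the
// last chunk is only 7 bits wide and its buckets fit in the much smaller
// bucketg1JacExtendedC7 array, which is why case 13 above pairs the C13
// buckets with the C7 ones.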
- conf.G1.CRange = make([]int, 23) - conf.G2.CRange = make([]int, 23) for i := 0; i < len(conf.G1.CRange); i++ { - conf.G1.CRange[i] = i + 1 - conf.G2.CRange[i] = i + 1 + lc := lastC(conf.G1.CRange[i]) + if !contains(conf.G1.CRange, lc) && !contains(conf.G1.LastCRange, lc) { + conf.G1.LastCRange = append(conf.G1.LastCRange, lc) + } + } + for i := 0; i < len(conf.G2.CRange); i++ { + lc := lastC(conf.G2.CRange[i]) + if !contains(conf.G2.CRange, lc) && !contains(conf.G2.LastCRange, lc) { + conf.G2.LastCRange = append(conf.G2.LastCRange, lc) + } } bavardOpts := []func(*bavard.Bavard) error{bavard.Funcs(funcs)} if err := bgen.GenerateWithOptions(conf, packageName, "./ecc/template", bavardOpts, entries...); err != nil { @@ -105,3 +113,12 @@ type pconf struct { config.Curve config.Point } + +func contains(slice []int, v int) bool { + for i := 0; i < len(slice); i++ { + if slice[i] == v { + return true + } + } + return false +} diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 62b5f03f62..9909d8055b 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -16,6 +16,10 @@ import ( "runtime" ) +{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} + + // selector stores the index, mask and shifts needed to select bits from a scalar // it is used during the multiExp algorithm or the batch scalar multiplication type selector struct { @@ -157,8 +161,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } -{{ template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} {{define "multiexp" }} @@ -270,136 +272,68 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInner{{ $.TJacobian }} , but that would incur a cost of looping through all scalars one more time + // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - // we have nbSplits intermediate results that we must sum together. + // we have nbSplits intermediate results that we must sum together. 
_p := make([]{{ $.TJacobian }}, nbSplits - 1) chDone := make(chan int, nbSplits - 1) for i:=0; i < nbSplits-1; i++ { start := i * nbPoints - end := start + nbPoints + end := start + nbPoints go func(start, end, i int) { - msmInner{{ $.TJacobian }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + innerMsm{{ $.UPointName }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) chDone <- i }(start, end, i) } - - msmInner{{ $.TJacobian }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + + innerMsm{{ $.UPointName }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) for i:=0; i < nbSplits-1; i++ { done := <-chDone p.AddAssign(&_p[done]) } close(chDone) - return p, nil + return p, nil } -func msmInner{{ $.TJacobian }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { +func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { switch c { {{range $c := $.CRange}} + {{- $lc := lastC $c}} case {{$c}}: - msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) - {{end}} + {{- if le $c 9}} + processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + {{- else}} + processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}] + {{- end}} + {{- if eq $c $lc}} + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, scalars, splitFirstChunk, processChunk, processChunk) + {{- else}} + {{- if le $lc 9}} + processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] + {{- else}} + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}] + {{- end}} + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, scalars, splitFirstChunk, processChunk, processLastChunk) + {{- end}} + {{- end}} default: panic("not implemented") } } -// msmReduceChunk{{ $.TAffine }} reduces the weighted sum of the buckets into the result of the multiExp -func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { - var _p {{ $.TJacobianExtended }} - totalj := <-chChunks[len(chChunks)-1] - _p.Set(&totalj) - for j := len(chChunks) - 2; j >= 0; j-- { - for l := 0; l < c; l++ { - _p.double(&_p) - } - totalj := <-chChunks[j] - _p.add(&totalj) - } - - return p.unsafeFromJacExtended(&_p) -} - - -func msmProcessChunk{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}](chunk uint64, - chRes chan<- {{ $.TJacobianExtended }}, - c uint64, - points []{{ $.TAffine }}, - scalars []fr.Element) { - - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c -1)) - - var buckets B - for i := 0 ; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - // if msbWindow bit is set, we need to substract - if bits & msbWindow == 0 { - // add - buckets[bits-1].addMixed(&points[i]) - } else { - // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) - } - } - - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total {{ $.TJacobianExtended }} - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].ZZ.IsZero() { - runningSum.add(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - {{/* close(chRes) */}} -} - - - -func msmC{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}, LB ib{{ $.TJacobianExtended }}](p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element)) *{{ $.TJacobian }} { + nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { nbChunks++ } - // for each chunk, spawn one go routine that'll loop through all the scalars in the + + // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is + // note that buckets is an array allocated on the stack (for most sizes of c) and this is // critical for performance // each go routine sends its result in chChunks[i] channel @@ -408,48 +342,58 @@ func msmC{{ $.TAffine }}[B ib{{ $.TJacobianExtended }}, LB ib{{ $.TJacobianExten chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1) } + // the last chunk may be processed with a different method than the rest, as it could be smaller. + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + for j := int(nbChunks - 2); j >0; j-- { + go processChunk(uint64(j), chChunks[j], c, points, scalars) + } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]{{ $.TJacobianExtended }}, 1<<(lastC-1)) - // TODO @gbotrel last C restore. - msmProcessChunk{{ $.TAffine }}[LB](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks - 1), points, scalars) - nbChunks-- + // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] + // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed + // in the ~same amount of time + if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
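+		// with mostly-small scalars (e.g. 0/1 SNARK witness values) every window
+		// above the lowest is zero, so the goroutines for the other chunks return
+		// almost immediately and chunk 0 dominates the wall clock; halving its
+		// input restores the balance at the cost of one extra add below.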
+ if !splitFirstChunk { + go processChunk(0,chChunks[0], c, points, scalars) + } else { + chSplit := make(chan {{ $.TJacobianExtended }}, 2) + split := len(points) / 2 + go processChunk(0,chSplit, c, points[:split], scalars[:split]) + go processChunk(0,chSplit, c, points[split:], scalars[split:]) + go func() { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[0] <- s1 + }() + } + } + + return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) +} - processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) { - msmProcessChunk{{ $.TAffine }}[B](uint64(j), chChunk, c, points, scalars) - } - for j := int(nbChunks - 1); j >0; j-- { - go processChunk(j, points, scalars, chChunks[j]) +// msmReduceChunk{{ $.TAffine }} reduces the weighted sum of the buckets into the result of the multiExp +func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { + var _p {{ $.TJacobianExtended }} + totalj := <-chChunks[len(chChunks)-1] + _p.Set(&totalj) + for j := len(chChunks) - 2; j >= 0; j-- { + for l := 0; l < c; l++ { + _p.double(&_p) + } + totalj := <-chChunks[j] + _p.add(&totalj) } - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan {{ $.TJacobianExtended }}, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } + return p.unsafeFromJacExtended(&_p) +} + + - return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) -} {{end }} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 3a803280f3..0c0fba41e5 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -8,10 +8,6 @@ import ( "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" - "github.com/consensys/gnark-crypto/ecc" - "errors" - "math" - "runtime" ) const MAX_BATCH_SIZE = 600 @@ -26,158 +22,124 @@ func (o batchOp) isNeg() bool { -{{ template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} {{define "multiexp" }} -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. 
-func (p *{{ $.TAffine }}) MultiExpBatchAffine(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TAffine }}, error) { - var _p {{$.TJacobian}} - if _, err := _p.MultiExpBatchAffine(points, scalars, config); err != nil { - return nil, err - } - p.FromJacobian(&_p) - return p, nil -} +// processChunk{{ $.UPointName }}BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. +// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64, + chRes chan<- {{ $.TJacobianExtended }}, + c uint64, + points []{{ $.TAffine }}, + scalars []fr.Element) { -// MultiExpBatchAffine implements section 4 of https://eprint.iacr.org/2012/549.pdf -// -// This call return an error if len(scalars) != len(points) or if provided config is invalid. -func (p *{{ $.TJacobian }}) MultiExpBatchAffine(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TJacobian }}, error) { - // note: - // each of the batchAffineMsmCX method is the same, except for the c constant it declares - // duplicating (through template generation) these methods allows to declare the buckets on the stack - // the choice of c needs to be improved: - // there is a theoritical value that gives optimal asymptotics - // but in practice, other factors come into play, including: - // * if c doesn't divide 64, the word size, then we're bound to select bits over 2 words of our scalars, instead of 1 - // * number of CPUs - // * cache friendliness (which depends on the host, G1 or G2... ) - // --> for example, on BN254, a G1 point fits into one cache line of 64bytes, but a G2 point don't. - - // for each batchAffineMsmCX - // step 1 - // we compute, for each scalars over c-bit wide windows, nbChunk digits - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract - // 2^{c} to the current digit, making it negative. 
- // negative digits will be processed in the next step as adding -G into the bucket instead of G - // (computing -G is cheap, and this saves us half of the buckets) - // step 2 - // buckets are declared on the stack - // notice that we have 2^{c-1} buckets instead of 2^{c} (see step1) - // we use jacobian extended formulas here as they are faster than mixed addition - // msmProcessChunk places points into buckets base on their selector and return the weighted bucket sum in given channel - // step 3 - // reduce the buckets weigthed sums into our result (msmReduceChunk) - - // ensure len(points) == len(scalars) - nbPoints := len(points) - if nbPoints != len(scalars) { - return nil, errors.New("len(points) != len(scalars)") + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c - 1)) + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() } - // if nbTasks is not set, use all available CPUs - if config.NbTasks <= 0 { - config.NbTasks = runtime.NumCPU() - } else if config.NbTasks > 1024 { - return nil, errors.New("invalid config: config.NbTasks > 1024") + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) } - // here, we compute the best C for nbPoints - // we split recursively until nbChunks(c) >= nbTasks, - bestC := func(nbPoints int) uint64 { - // implemented batchAffineMsmC methods (the c we use must be in this slice) - implementedCs := []uint64{ - {{- range $c := $.CRange}} {{- if and (eq $.PointName "g1") (gt $c 21)}}{{- else}} {{$c}},{{- end}}{{- end}} + batch := newBatch{{ $.TAffine }}(&buckets, points) + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + nbBatches := 0 + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh } - var C uint64 - // approximate cost (in group operations) - // cost = bits/c * (nbPoints + 2^{c}) - // this needs to be verified empirically. - // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results - min := math.MaxFloat64 - for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) - cost := float64(cc) / float64(c) - if cost < min { - min = cost - C = c - } + + if bits == 0 { + continue } - // empirical, needs to be tuned. 
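The removed comments above encode the window-size heuristic: approximate cost = (scalar bits / c) * (nbPoints + 2^c) group operations, minimized over the implemented window sizes. A self-contained sketch of that selection; the candidate list and bit count are parameters here rather than the generated constants:

package main

import (
	"fmt"
	"math"
)

// bestC returns the candidate window size minimizing the approximate cost
// (frBits/c)*(nbPoints + 2^c): frBits/c chunks, each doing ~nbPoints bucket
// additions plus ~2^c operations to reduce its buckets.
func bestC(nbPoints, frBits int, candidates []uint64) uint64 {
	best, min := candidates[0], math.MaxFloat64
	for _, c := range candidates {
		cost := float64(frBits) * float64(nbPoints+1<<c) / float64(c)
		if cost < min {
			min, best = cost, c
		}
	}
	return best
}

func main() {
	cs := []uint64{4, 5, 8, 16}
	// larger windows win once nbPoints >> 2^c:
	fmt.Println(bestC(1<<20, 256, cs)) // 16
}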
- // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } - return C - } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs * 64) % C != 0 { - nbChunks ++ + op := batchOp{pointID: uint32(i) << 1} + // if msbWindow bit is set, we need to substract + if bits&msbWindow == 0 { + // add + op.bucketID = uint32(bits - 1) + // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + } else { + // sub + op.bucketID = (uint32(bits & ^msbWindow)) + op.pointID += 1 + // op.isNeg = true + // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 + if batch.CanAdd(op.bucketID) { + batch.Add(op) + if batch.IsFull() { + batch.ExecuteAndReset() + nbBatches++ + if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + batch.Add(queue[len(queue)-1]) + queue = queue[:len(queue)-1] + } + } + } else { + // put it in queue. + queue = append(queue, op) } } - - // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in msmInner{{ $.TJacobian }}BatchAffine , but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - // we have nbSplits intermediate results that we must sum together. - _p := make([]{{ $.TJacobian }}, nbSplits - 1) - chDone := make(chan int, nbSplits - 1) - for i:=0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - msmInner{{ $.TJacobian }}BatchAffine(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", + // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // batch.ExecuteAndReset() + for len(queue) != 0 { + queue = processQueue{{ $.TAffine }}(queue, &batch) + batch.ExecuteAndReset() // execute batch even if not full. } - msmInner{{ $.TJacobian }}BatchAffine(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - for i:=0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) + // flush items in batch. + batch.ExecuteAndReset() + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] + + var runningSum, total {{ $.TJacobianExtended }} + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].IsInfinity() { + runningSum.addMixed(&buckets[k]) + } + total.add(&runningSum) } - close(chDone) - return p, nil + + chRes <- total + } -func msmInner{{ $.TJacobian }}BatchAffine(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { - - switch c { - {{range $c := $.CRange}} - case {{$c}}: - {{- if le $c 9}} - msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) - {{- else}} - batch{{ $.TAffine }}Msm[bucket{{ $.TAffine }}C{{$c}}, bucket{{ $.TJacobianExtended }}C{{lastC $c}}](p, {{$c}}, points, scalars, splitFirstChunk) - {{- end}} - {{end}} - default: - panic("not implemented") - } +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +{{- range $c := $.CRange}} +type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} +{{- end}} + +type ib{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + bucket{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} } @@ -288,180 +250,4 @@ func processQueue{{ $.TAffine }}[B ib{{ $.TAffine }}](queue []batchOp, batch *Ba } -func msmProcessChunk{{ $.TAffine }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64, - chRes chan<- {{ $.TJacobianExtended }}, - c uint64, - points []{{ $.TAffine }}, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() - } - - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - batch := newBatch{{ $.TAffine }}(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } - - if bits == 0 { - continue - } - - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { - // add - op.bucketID = uint32(bits - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) - } else { - // sub - op.bucketID = (uint32(bits & ^msbWindow)) - op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) - } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() - nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - } - } else { - // put it in queue. 
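// why the queue exists: a batch performs a single shared inversion and then writes
// each destination bucket exactly once (see BatchAddG1Affine), so a bucket may
// appear at most once per batch; batch.CanAdd reports whether the bucket is still
// free in the current batch, and a conflicting op is parked in the queue and
// retried after the batch has been executed and reset.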
- queue = append(queue, op) - } - } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() - for len(queue) != 0 { - queue = processQueue{{ $.TAffine }}(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. - } - - // flush items in batch. - batch.ExecuteAndReset() - - // reduce buckets into total - // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - - var runningSum, total {{ $.TJacobianExtended }} - runningSum.setInfinity() - total.setInfinity() - for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { - runningSum.addMixed(&buckets[k]) - } - total.add(&runningSum) - } - - chRes <- total - -} - - - -func batch{{ $.TAffine }}Msm[B ib{{ $.TAffine }}, J ib{{ $.TJacobianExtended }}](p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) *{{ $.TJacobian }} { - - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { - nbChunks++ - } - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks) - for i:=0; i < len(chChunks);i++ { - chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1) - } - - if (fr.Limbs*64)%c != 0 { - // TODO @gbotrel not always needed to do ext jac here. - go func(j uint64, points []{{ $.TAffine }}, scalars []fr.Element) { - // var buckets LB - // lastC := (fr.Limbs * 64) - (c * (fr.Limbs * 64 / c)) - // buckets := make([]{{ $.TJacobianExtended }}, 1<<(lastC-1)) - // TODO @gbotrel lastC restore. 
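// note on the last chunk: when c does not divide fr.Limbs*64, the final window is
// narrower, lastC = fr.Limbs*64 - c*(nbChunks-1) bits, so it needs only
// 2^{lastC-1} buckets; with that few buckets the batch-affine machinery does not
// pay for its bookkeeping, hence the fallback to the ext-jacobian processing
// below (which window sizes justify the affine path is still an open tuning
// question, per the TODOs in this patch).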
- msmProcessChunk{{ $.TAffine }}[J](j, chChunks[j], c, points, scalars) - }(uint64(nbChunks - 1), points, scalars) - nbChunks-- - } - - - processChunk := func(j int, points []{{ $.TAffine }}, scalars []fr.Element, chChunk chan {{ $.TJacobianExtended }}) { - msmProcessChunk{{ $.TAffine }}BatchAffine[B](uint64(j), chChunk, c, points, scalars) - } - - for j := int(nbChunks - 1); j >0; j-- { - go processChunk(j, points, scalars, chChunks[j]) - } - - if !splitFirstChunk { - go processChunk(0, points, scalars, chChunks[0]) - } else { - chSplit := make(chan {{ $.TJacobianExtended }}, 2) - split := len(points) / 2 - go processChunk(0, points[:split], scalars[:split], chSplit) - go processChunk(0, points[split:], scalars[split:], chSplit) - go func() { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[0] <- s1 - }() - } - - return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) -} - - - - -{{- range $c := $.CRange}} -type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} -{{- end}} -{{- range $c := $.CRange}} -type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} -{{- end}} - -type ib{{ $.TAffine }} interface { - {{- range $i, $c := $.CRange}} - bucket{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} - {{- end}} -} - -type ib{{ $.TJacobianExtended }} interface { - {{- range $i, $c := $.CRange}} - bucket{{ $.TJacobianExtended }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} - {{- end}} -} - {{end }} diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl new file mode 100644 index 0000000000..72217dba82 --- /dev/null +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -0,0 +1,106 @@ +{{ $G1TAffine := print (toUpper .G1.PointName) "Affine" }} +{{ $G1TJacobian := print (toUpper .G1.PointName) "Jac" }} +{{ $G1TJacobianExtended := print (toLower .G1.PointName) "JacExtended" }} + +{{ $G2TAffine := print (toUpper .G2.PointName) "Affine" }} +{{ $G2TJacobian := print (toUpper .G2.PointName) "Jac" }} +{{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} + +import ( + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" +) + + +{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} +{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} + + + +{{define "multiexp" }} + +func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk uint64, + chRes chan<- {{ $.TJacobianExtended }}, + c uint64, + points []{{ $.TAffine }}, + scalars []fr.Element) { + + + mask := uint64((1 << c) - 1) // low c bits are 1 + msbWindow := uint64(1 << (c -1)) + + var buckets B + for i := 0 ; i < len(buckets); i++ { + buckets[i].setInfinity() + } + + jc := uint64(chunk * c) + s := selector{} + s.index = jc / 64 + s.shift = jc - (s.index * 64) + s.mask = mask << s.shift + s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) + if s.multiWordSelect { + nbBitsHigh := s.shift - uint64(64-c) + s.maskHigh = (1 << nbBitsHigh) - 1 + s.shiftHigh = (c - nbBitsHigh) + } + + + // for each scalars, get the digit 
corresponding to the chunk we're processing. + for i := 0; i < len(scalars); i++ { + bits := (scalars[i][s.index] & s.mask) >> s.shift + if s.multiWordSelect { + bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh + } + + if bits == 0 { + continue + } + + // if msbWindow bit is set, we need to substract + if bits & msbWindow == 0 { + // add + buckets[bits-1].addMixed(&points[i]) + } else { + // sub + buckets[bits & ^msbWindow].subMixed(&points[i]) + } + } + + + // reduce buckets into total + // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] + + var runningSum, total {{ $.TJacobianExtended }} + runningSum.setInfinity() + total.setInfinity() + for k := len(buckets) - 1; k >= 0; k-- { + if !buckets[k].ZZ.IsZero() { + runningSum.add(&buckets[k]) + } + total.add(&runningSum) + } + + chRes <- total + {{/* close(chRes) */}} +} + +// we declare the buckets as fixed-size array types +// this allow us to allocate the buckets on the stack +{{- range $c := $.CRange}} +type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} +{{- end}} +{{- range $c := $.LastCRange}} +type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} +{{- end}} + +type ib{{ $.TJacobianExtended }} interface { + {{- range $i, $c := $.LastCRange}} + bucket{{ $.TJacobianExtended }}C{{$c}} | + {{- end}} + {{- range $i, $c := $.CRange}} + bucket{{ $.TJacobianExtended }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} +} + +{{end }} diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 93e26a09d6..ef51368fbc 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -91,7 +91,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - msmC{{ $.TAffine }}[bucket{{ $.TJacobianExtended }}C16, bucket{{ $.TJacobianExtended }}C{{lastC 16}}](&r16, 16, samplePoints[:], scalars16, true) + innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePoints[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -138,10 +138,10 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { results := make([]{{ $.TJacobian }}, len(cRange) + 1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInner{{ $.TJacobian }}(&results[i], int(c), samplePoints[:], scalars, false) + innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePoints[:], scalars, false) if c == 16 { // split the first chunk - msmInner{{ $.TJacobian }}(&results[len(results)-1], 16, samplePoints[:], scalars, true) + innerMsm{{ toUpper $.PointName }}(&results[len(results)-1], 16, samplePoints[:], scalars, true) } } for i:=1; i < len(results);i++ { @@ -179,10 +179,10 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { results := make([]{{ $.TJacobian }}, len(cRange)+1) for i, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInner{{ $.TJacobian }}(&results[i], int(c), samplePointsZero[:], scalars, false) + innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePointsZero[:], scalars, false) if c == 16 { // split the first chunk - msmInner{{ $.TJacobian }}(&results[len(results)-1], 16, 
samplePointsZero[:], scalars, true) + innerMsm{{ toUpper $.PointName }}(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) } } for i := 1; i < len(results); i++ { @@ -218,8 +218,8 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { var result1, result2 {{ $.TJacobian }} for _, c := range cRange { scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - msmInner{{ $.TJacobian }}(&result1, int(c), samplePoints[:], scalars, false) - msmInner{{ $.TJacobian }}BatchAffine(&result2, int(c), samplePoints[:], scalars, false) + innerMsm{{ toUpper $.PointName }}(&result1, int(c), samplePoints[:], scalars, false) + innerMsm{{ toUpper $.PointName }}(&result2, int(c), samplePoints[:], scalars, false) if !result1.Equal(&result2) { return false } @@ -300,7 +300,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { - testPoint.MultiExpBatchAffine(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) + testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) } }) } From 16352ccdef68a4528ba531448e48da2891662c69 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 8 Nov 2022 14:42:08 -0600 Subject: [PATCH 05/43] docs: added a todo in tmpl --- ecc/bls12-377/multiexp_jacobian.go | 2 -- ecc/bls12-378/multiexp_jacobian.go | 2 -- ecc/bls12-381/multiexp_jacobian.go | 2 -- ecc/bls24-315/multiexp_jacobian.go | 2 -- ecc/bls24-317/multiexp_jacobian.go | 2 -- ecc/bn254/multiexp_jacobian.go | 2 -- ecc/bw6-633/multiexp_jacobian.go | 2 -- ecc/bw6-756/multiexp_jacobian.go | 2 -- ecc/bw6-761/multiexp_jacobian.go | 2 -- internal/generator/ecc/template/multiexp.go.tmpl | 2 ++ internal/generator/ecc/template/multiexp_jacobian.go.tmpl | 1 - 11 files changed, 2 insertions(+), 19 deletions(-) diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index fc89ebd2cc..dc787cb52e 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index a26fe93845..33efbc4286 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index a4e61348b7..b62f2d6012 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 4399395829..f4a2abe4b6 100644 --- 
a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index d948e2c697..b928244181 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 4939af44c8..5434d0e1aa 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -185,7 +184,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index f331d07491..8ed7343862 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -159,7 +158,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 9dc8862130..984264456b 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -159,7 +158,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 376ee3df28..6e2acf4b41 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -81,7 +81,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types @@ -159,7 +158,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } chRes <- total - } // we declare the buckets as fixed-size array types diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 9909d8055b..6b5b935d81 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -298,6 +298,8 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { + {{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}} + {{- /* also need to determine until which 
window size the ext-jacobian version is worth it. */}} switch c { {{range $c := $.CRange}} {{- $lc := lastC $c}} diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index 72217dba82..d4e00fa442 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -82,7 +82,6 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk } chRes <- total - {{/* close(chRes) */}} } // we declare the buckets as fixed-size array types From 95e4305ce5c393fd05489e0f022ed2947d42b980 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 8 Nov 2022 15:18:41 -0600 Subject: [PATCH 06/43] feat: partitionScalars return list of digits unpacked --- ecc/bls12-377/g1.go | 2 +- ecc/bls12-377/g2.go | 2 +- ecc/bls12-377/multiexp.go | 343 ++++++++++++------ ecc/bls12-377/multiexp_affine.go | 62 +--- ecc/bls12-377/multiexp_jacobian.go | 64 +--- ecc/bls12-377/multiexp_test.go | 4 +- ecc/bls12-378/g1.go | 2 +- ecc/bls12-378/g2.go | 2 +- ecc/bls12-378/multiexp.go | 343 ++++++++++++------ ecc/bls12-378/multiexp_affine.go | 62 +--- ecc/bls12-378/multiexp_jacobian.go | 64 +--- ecc/bls12-378/multiexp_test.go | 4 +- ecc/bls12-381/g1.go | 2 +- ecc/bls12-381/g2.go | 2 +- ecc/bls12-381/multiexp.go | 343 ++++++++++++------ ecc/bls12-381/multiexp_affine.go | 62 +--- ecc/bls12-381/multiexp_jacobian.go | 64 +--- ecc/bls12-381/multiexp_test.go | 4 +- ecc/bls24-315/g1.go | 2 +- ecc/bls24-315/g2.go | 2 +- ecc/bls24-315/multiexp.go | 343 ++++++++++++------ ecc/bls24-315/multiexp_affine.go | 62 +--- ecc/bls24-315/multiexp_jacobian.go | 64 +--- ecc/bls24-315/multiexp_test.go | 4 +- ecc/bls24-317/g1.go | 2 +- ecc/bls24-317/g2.go | 2 +- ecc/bls24-317/multiexp.go | 343 ++++++++++++------ ecc/bls24-317/multiexp_affine.go | 62 +--- ecc/bls24-317/multiexp_jacobian.go | 64 +--- ecc/bls24-317/multiexp_test.go | 4 +- ecc/bn254/g1.go | 2 +- ecc/bn254/g2.go | 2 +- ecc/bn254/multiexp.go | 343 ++++++++++++------ ecc/bn254/multiexp_affine.go | 62 +--- ecc/bn254/multiexp_jacobian.go | 64 +--- ecc/bn254/multiexp_test.go | 4 +- ecc/bw6-633/g1.go | 2 +- ecc/bw6-633/g2.go | 2 +- ecc/bw6-633/multiexp.go | 299 ++++++++++----- ecc/bw6-633/multiexp_affine.go | 62 +--- ecc/bw6-633/multiexp_jacobian.go | 64 +--- ecc/bw6-633/multiexp_test.go | 4 +- ecc/bw6-756/g1.go | 2 +- ecc/bw6-756/g2.go | 2 +- ecc/bw6-756/multiexp.go | 299 ++++++++++----- ecc/bw6-756/multiexp_affine.go | 62 +--- ecc/bw6-756/multiexp_jacobian.go | 64 +--- ecc/bw6-756/multiexp_test.go | 4 +- ecc/bw6-761/g1.go | 2 +- ecc/bw6-761/g2.go | 2 +- ecc/bw6-761/multiexp.go | 299 ++++++++++----- ecc/bw6-761/multiexp_affine.go | 62 +--- ecc/bw6-761/multiexp_jacobian.go | 64 +--- ecc/bw6-761/multiexp_test.go | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 217 ++++++++--- .../ecc/template/multiexp_affine.go.tmpl | 32 +- .../ecc/template/multiexp_jacobian.go.tmpl | 33 +- internal/generator/ecc/template/point.go.tmpl | 2 +- .../ecc/template/tests/multiexp.go.tmpl | 2 +- 59 files changed, 2421 insertions(+), 2026 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index bc9027480a..3b436a6b2b 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -915,7 +915,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, 
runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index fdf535ca82..18810fe510 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -914,7 +914,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 1673861355..4fad52e512 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
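// the TODO above plans to restore the nbSplits > 1 path by splitting the point
// set before partitioning the scalars; a rough sketch of that shape, assuming
// the current innerMsmG1 signature (variable names are illustrative):
//
//	half := nbPoints / 2
//	dLo, _ := partitionScalars(scalars[:half], C, config.ScalarsMont, config.NbTasks)
//	dHi, _ := partitionScalars(scalars[half:], C, config.ScalarsMont, config.NbTasks)
//	var pLo, pHi G1Jac
//	innerMsmG1(&pLo, int(C), points[:half], dLo, splitFirstChunk)
//	innerMsmG1(&pHi, int(C), points[half:], dHi, splitFirstChunk)
//	p.Set(&pLo)
//	p.AddAssign(&pHi)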
- _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
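// note: partitionScalars lays the digits out chunk-major: the digit of scalar i
// for chunk j lives at pscalars[j*n+i], with n = len(points); the slices handed
// to the goroutines below therefore each cover exactly one chunk:
//
//	digits := pscalars[j*n : (j+1)*n] // all n digits of chunk j
//	bits := digits[i]                 // digit of scalar i in that chunk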
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
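// once every chunk goroutine has sent its bucket sum, the per-chunk results are
// recombined as result = sum_j 2^{c*j} * chunk_j; a minimal sketch of that
// reduction (done by msmReduceChunkG2Affine), assuming the ext-jacobian
// double/add helpers used elsewhere in this patch:
//
//	var acc g2JacExtended
//	acc.setInfinity()
//	for j := len(chChunks) - 1; j >= 0; j-- {
//		if j != len(chChunks)-1 {
//			// c doublings multiply the accumulator by 2^c (one window)
//			for l := uint64(0); l < c; l++ {
//				acc.double(&acc)
//			}
//		}
//		total := <-chChunks[j]
//		acc.add(&total)
//	}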
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])

 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}

 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -565,7 +553,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ a buffer of nbTasks is enough, as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it did, though, this would deadlock.
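// (the nbTasks buffer on the channel below is what makes this safe: every worker
// can complete its send before the aggregation loop starts receiving; an
// unbuffered channel would block the senders and parallel.Execute would never
// return)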
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// toReturn[i][s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}

+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar, nbChunks digits over c-bit wide windows
+// if the digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))

 	// number of c-bit radixes in a scalar
diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go
index ac8d41cbb6..c3f89f7406 100644
--- a/ecc/bls12-377/multiexp_affine.go
+++ b/ecc/bls12-377/multiexp_affine.go
@@ -16,10 +16,6 @@

 package bls12377

-import (
-	"github.com/consensys/gnark-crypto/ecc/bls12-377/fr"
-)
-
 const MAX_BATCH_SIZE = 600

 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]

 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,

 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {

-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}

-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
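// sketch of the batchOp encoding used in the loop below: the digit's low bit
// selects add vs. sub, and the point's sign travels in the low bit of pointID
// (both IDs fit in a uint32):
//
//	op := batchOp{pointID: uint32(i) << 1} // point i, positive
//	if bits&1 == 0 {
//		op.bucketID = (bits >> 1) - 1 // even digit 2d: add to bucket d-1
//	} else {
//		op.bucketID = bits >> 1 // odd digit 2d+1: subtract, bucket d
//		op.pointID |= 1         // mark the point as negated
//	}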
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index dc787cb52e..1981509ee0 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls12377 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
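// decoding convention for the unpacked digits (cf. partitionScalars): 0 means
// "skip this point", an even value 2d encodes the digit +d, and an odd value
// 2d+1 encodes the digit -(d+1); hence bucket (bits>>1)-1 on the add path and
// bucket bits>>1 on the sub path below.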
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 7882874fda..fe84ecc91f 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index fd8fbe7ee0..5b9ec0f84f 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -915,7 +915,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 479cda7053..0010b3983b 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -914,7 +914,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 862cca829b..200a5fc096 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
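// worked example for the chunk count computed below: fr.Limbs*64 = 256 bits, so
// C=16 yields exactly 16 chunks, while C=21 yields ceil(256/21) = 13 chunks
// whose last window is only 256 - 12*21 = 4 bits wide; that is why innerMsmG1's
// c=21 case pairs processChunkG1BatchAffine with the tiny bucketg1JacExtendedC4
// for the last chunk.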
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
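With the split removed, partitionScalars returns a flat []uint32 laid out chunk-major: the digit of scalar i for chunk j lives at index j*len(scalars)+i. That is what makes the per-chunk slicing in _innerMsmG1 above and _innerMsmG2 below a plain sub-slice. A small sketch of the indexing (digitsForChunk is illustrative, not part of the patch):

// digitsForChunk returns the n digits chunk j must process, given the
// chunk-major layout produced by the new partitionScalars; the digit of
// scalar i within that chunk is the slice's i-th element.
func digitsForChunk(pscalars []uint32, j, n int) []uint32 {
	return pscalars[j*n : (j+1)*n]
}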
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
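The bucket types picked for processLastChunk in the switch above follow from the width of the top window: with 256-bit scalars and c = 13 the last window spans 256 - 19*13 = 9 bits, hence the C9 bucket type; with c = 15 it spans a single bit, hence C1. A sketch of that computation (lastChunkWidth is illustrative, not part of the patch):

// lastChunkWidth returns the bit-width of the most-significant window,
// with scalarBits = fr.Limbs*64 in the code above.
func lastChunkWidth(scalarBits, c uint64) uint64 {
	nbChunks := (scalarBits + c - 1) / c // ceiling division, as in _innerMsmG2
	return scalarBits - (nbChunks-1)*c
}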
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -565,7 +553,132 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zero points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represents a significant number of points, then we will split the first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines + // if it does, though, this will deadlock.
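Concretely, for c = 4 a raw window value lives in [0, 16), but the loop below keeps digits in [-8, 8): a window value of 9 becomes 9 - 16 = -7 and pushes a carry of 1 into the next window. A self-contained sketch of that recoding over plain ints (recodeWindows is illustrative, not the library code):

// recodeWindows turns little-endian base-2^c windows into signed digits
// in [-2^(c-1), 2^(c-1)), propagating the borrow the same way the
// parallel.Execute loop below does; the final carry spills into one
// extra digit.
func recodeWindows(windows []int, c uint) []int {
	digits := make([]int, len(windows)+1)
	carry := 0
	for i, w := range windows {
		d := w + carry
		carry = 0
		if d >= 1<<(c-1) {
			d -= 1 << c
			carry = 1
		}
		digits[i] = d
	}
	digits[len(windows)] = carry
	return digits
}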
+ chSmallValues := make(chan int, nbTasks) + + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int + + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } + + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] + + // init with carry if any + digit := carry + carry = 0 + + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + + // if digit is zero, no impact on result + if digit == 0 { + continue + } + + // if the digit is larger than or equal to 2^{c-1}, we borrow 2^c from the next window and subtract + // 2^{c} from the current digit, making it negative. + if digit >= max { + digit -= (1 << c) + carry = 1 + } + + var bits uint32 + if digit >= 0 { + bits = uint32(digit) << 1 + } else { + bits = (uint32(-digit-1) << 1) + 1 + } + toReturn[int(chunk)*len(scalars)+i] = bits + // [s.index] |= (bits << s.shift) + // if s.multiWordSelect { + // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + // } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues +} + +// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits +// if the digit is larger than or equal to 2^{c-1}, we borrow 2^c from the next window and subtract +// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G +// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) +// scalarsMont indicates whether the provided scalars are in Montgomery form +// returns smallValues, which represents the number of scalars that meet the following condition +// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) +func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { toReturn := make([]fr.Element, len(scalars)) // number of c-bit radixes in a scalar diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 583761fe76..06c5f74bfe 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -16,10 +16,6 @@ package bls12378 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" -) - const MAX_BATCH_SIZE = 600 type batchOp struct { @@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) @@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
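The batchOp construction above uses the same low-bit trick for points as for digits: pointID holds the point index shifted left by one, and the sub branch sets the low bit via op.pointID += 1. A sketch of the round-trip (packOp/unpackOp are hypothetical helpers, not in the patch):

// packOp and unpackOp mirror the pointID packing in the
// processChunk*BatchAffine loops: index<<1, low bit set for a subtraction.
func packOp(pointIndex uint32, neg bool) uint32 {
	id := pointIndex << 1
	if neg {
		id |= 1
	}
	return id
}

func unpackOp(pointID uint32) (pointIndex uint32, neg bool) {
	return pointID >> 1, pointID&1 == 1
}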
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 33efbc4286..592070f11d 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls12378 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 8a80c9d1f8..a94fc2e0b9 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 189c5ac202..eccf0c9c97 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -915,7 +915,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 3473f3d002..5264766d99 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -915,7 +915,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index d926dc8e2e..50b716e3c4 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
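With the splitting loop gone, C is chosen once from the total point count and the chunk count is a ceiling division: for 256-bit scalars, C = 16 gives 16 chunks, while C = 20 gives ceil(256/20) = 13. A one-line sketch (nbChunksFor is illustrative, not part of the patch):

// nbChunksFor returns ceil(scalarBits/c), the number of c-bit windows,
// mirroring the nbChunks computation just below (scalarBits = fr.Limbs*64).
func nbChunksFor(scalarBits, c uint64) uint64 {
	return (scalarBits + c - 1) / c
}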
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -565,7 +553,132 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zero points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represents a significant number of points, then we will split the first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines + // if it does, though, this will deadlock.
+ chSmallValues := make(chan int, nbTasks) + + parallel.Execute(len(scalars), func(start, end int) { + smallValues := 0 + for i := start; i < end; i++ { + var carry int + + scalar := scalars[i] + if scalarsMont { + scalar.FromMont() + } + if scalar.FitsOnOneWord() { + // everything is 0, no need to process this scalar + if scalar[0] == 0 { + continue + } + // low c-bits are 1 in mask + if scalar[0]&mask == scalar[0] { + smallValues++ + } + } + + // for each chunk in the scalar, compute the current digit, and an eventual carry + for chunk := uint64(0); chunk < nbChunks; chunk++ { + s := selectors[chunk] + + // init with carry if any + digit := carry + carry = 0 + + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + + // if digit is zero, no impact on result + if digit == 0 { + continue + } + + // if the digit is larger than or equal to 2^{c-1}, we borrow 2^c from the next window and subtract + // 2^{c} from the current digit, making it negative. + if digit >= max { + digit -= (1 << c) + carry = 1 + } + + var bits uint32 + if digit >= 0 { + bits = uint32(digit) << 1 + } else { + bits = (uint32(-digit-1) << 1) + 1 + } + toReturn[int(chunk)*len(scalars)+i] = bits + // [s.index] |= (bits << s.shift) + // if s.multiWordSelect { + // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) + // } + + } + } + + chSmallValues <- smallValues + + }, nbTasks) + + // aggregate small values + close(chSmallValues) + smallValues := 0 + for o := range chSmallValues { + smallValues += o + } + return toReturn, smallValues +} + +// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits +// if the digit is larger than or equal to 2^{c-1}, we borrow 2^c from the next window and subtract +// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G +// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) +// scalarsMont indicates whether the provided scalars are in Montgomery form +// returns smallValues, which represents the number of scalars that meet the following condition +// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) +func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { toReturn := make([]fr.Element, len(scalars)) // number of c-bit radixes in a scalar diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 36695009a0..3f0eeabba3 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -16,10 +16,6 @@ package bls12381 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" -) - const MAX_BATCH_SIZE = 600 type batchOp struct { @@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatchG1Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) @@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatchG2Affine(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index b62f2d6012..3840228907 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls12381 -import ( - "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 15cd0f5304..8e356fde9d 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index 3209c210ad..173d24e902 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -917,7 +917,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 7f377b8147..e498978c0b 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -930,7 +930,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 3686207518..912d69ba44 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
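The deleted loop above auto-split the instance until enough chunks existed to feed config.NbTasks; with the new chunk-major digit layout the split has to happen before partitionScalars, hence the TODO. For reference, a standalone model of the retired heuristic (reusing the package's bestC; scalarBits stands in for fr.Limbs*64):

// splitHeuristic models the removed loop: double the number of splits
// (halving the points per split) until nbChunks*nbSplits covers nbTasks.
func splitHeuristic(nbPoints, nbTasks, scalarBits int) (c uint64, nbSplits int) {
	nbSplits = 1
	nbChunks := 0
	for nbChunks < nbTasks {
		c = bestC(nbPoints)
		nbChunks = scalarBits / int(c)
		if scalarBits%int(c) != 0 {
			nbChunks++
		}
		nbChunks *= nbSplits
		if nbChunks < nbTasks {
			nbSplits <<= 1
			nbPoints >>= 1
		}
	}
	return c, nbSplits
}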
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
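Because partitionScalars now returns digits in chunk-major order, the slicing above hands each goroutine one contiguous window: with n = len(points), the digit of scalar i for window j sits at pscalars[j*n+i]. A small sketch making the indexing explicit (hypothetical helpers):

// chunkDigits is the view processChunk receives for window j;
// len(pscalars) == nbChunks*n, so each window is one contiguous n-slice.
func chunkDigits(pscalars []uint32, n, j int) []uint32 {
	return pscalars[j*n : (j+1)*n]
}

// digitAt returns the encoded digit of scalar i in window j.
func digitAt(pscalars []uint32, n, j, i int) uint32 {
	return pscalars[j*n+i]
}

This layout is also why the first-chunk split passes pscalars[:split] and pscalars[split:n]: both halves index into window 0 only.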
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
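Concretely, the top window holds fr.Limbs*64 - (nbChunks-1)*c bits, which is why the switch above pairs, e.g., c = 5 with a 1-bit last chunk and c = 20 with a 16-bit one. A sketch of the arithmetic (assuming a 4-limb scalar field, i.e. 256 bits of limbs):

// lastWindowBits computes how many bits remain for the top window.
func lastWindowBits(c int) int {
	const totalBits = 4 * 64 // fr.Limbs * 64
	nbChunks := (totalBits + c - 1) / c
	return totalBits - (nbChunks-1)*c
}

// lastWindowBits(5) == 1   -> bucketg2JacExtendedC1 above
// lastWindowBits(20) == 16 -> bucketG2AffineC16 above
// lastWindowBits(16) == 16 -> c divides 256, same processor for all chunks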
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -565,7 +553,132 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
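In the parallel loop below, a window value v >= 2^{c-1} is rewritten as the negative digit v - 2^c plus a carry of 1 into the next window, so digit magnitudes never exceed 2^{c-1} and half the buckets suffice. A standalone model of one window step:

// borrow models the digit adjustment below: given the raw window value
// (incoming carry already added), return the signed digit and the
// outgoing carry for the next window.
func borrow(v, c int) (digit, carry int) {
	if v >= 1<<(c-1) {
		return v - (1 << c), 1
	}
	return v, 0
}

// e.g. c = 4: borrow(13, 4) == (-3, 1), since 13 = 16 - 3;
// -3 is then encoded as (3-1)<<1 | 1 == 5 and the next window gains +1.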
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and a possible carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// toReturn[i][s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				//	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go
index f1cdcfe574..93b8a89cdb 100644
--- a/ecc/bls24-315/multiexp_affine.go
+++ b/ecc/bls24-315/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bls24315
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bls24-315/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
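As in the G1 processor, pointID packs the point index with a negation flag in the low bit (op.pointID += 1 in the sub branch), so an op records both which point to add and whether it must enter the batch negated; computing -P is just a Y negation. A decoding sketch (hypothetical helper):

// decodeOp recovers the point index and whether the batched addition
// must use the negated point.
func decodeOp(op batchOp) (pointIdx uint32, isNeg bool) {
	return op.pointID >> 1, op.pointID&1 == 1
}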
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index f4a2abe4b6..be1b9a8b65 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls24315 -import ( - "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 6a17d03fb4..352e8122b6 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index a7198ef2ea..9443125d34 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -917,7 +917,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 907c1db13b..0e2738e211 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -930,7 +930,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 2cc4feb7fd..f2c3d767a1 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
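smallValues, computed by partitionScalars just below, counts scalars that fit entirely in the first c-bit window, a common shape for SNARK witnesses full of 0/1 values; when they exceed 10% of the input the first chunk is processed as two half-range goroutines whose partial results are summed. The threshold itself, mirroring the expression below:

// shouldSplitFirstChunk mirrors the 10% heuristic used below.
func shouldSplitFirstChunk(smallValues, nbScalars int) bool {
	return float64(smallValues)/float64(nbScalars) >= 0.1
}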
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
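Each chunk goroutine returns the bucket-reduced partial sum for one window; the final Jacobian result is the sum over j of 2^{c*j} times chunk j, folded from the most significant window down with c doublings per step (the fold itself sits outside these hunks). An integer model of the recombination:

// foldChunks models the window recombination, chunks[0] being the least
// significant window; <<= c stands in for c point doublings in the group.
func foldChunks(chunks []int, c uint) int {
	total := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		total <<= c
		total += chunks[j]
	}
	return total
}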
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
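Both the per-window goroutines below and the first-chunk split rely on the same linearity: a window's partial sum over disjoint point subsets can be computed independently and added, since for window j

  sum_{i<n} d_{i,j}*P_i = sum_{i<n/2} d_{i,j}*P_i + sum_{n/2<=i<n} d_{i,j}*P_i,

which is exactly what the s1/s2 merge on chSplit below computes for chunk 0.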
- go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g2JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -565,7 +553,132 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs*64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk := uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has more work to do than others, we can spawn off more go routines + // (at the cost of more buckets allocated) + // a simplified approach is to track the small values where only the first word is set + // if this number represent a significant number of points, then we will split first chunk + // processing in the msm in 2, to ensure all go routines finish at ~same time + // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine + // if it does, though, this will deadlocK. 
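When c does not divide 64, a window can straddle two limbs, which is what multiWordSelect above handles. Worked example for c = 5 on a 4-limb scalar, chunk 12: jc = 60, so index = 0, shift = 60, and only 4 bits remain in limb 0; nbBitsHigh = shift - (64-c) = 1 and shiftHigh = 4. As a standalone model:

// windowC5Chunk12 extracts the 5-bit digit of chunk 12 when c = 5:
// 4 low bits from the top of limb 0, 1 high bit from the bottom of limb 1.
func windowC5Chunk12(limbs [4]uint64) uint64 {
	lo := (limbs[0] >> 60) & 0x1f // mask = 0x1f << shift, then >> shift
	hi := limbs[1] & 0x1          // maskHigh = (1 << nbBitsHigh) - 1
	return lo + hi<<4             // shiftHigh = c - nbBitsHigh
}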
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and a possible carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// toReturn[i][s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				//	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go
index 916d8beced..2f19119cc6 100644
--- a/ecc/bls24-317/multiexp_affine.go
+++ b/ecc/bls24-317/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bls24317
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bls24-317/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
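With the digits precomputed, the branch above (and its G2 twin just below) is the entire per-point decode: bit 0 of the packed digit selects add versus subtract, and the remaining bits index a bucket that only ever covers magnitudes 1..2^{c-1}, which is how signed digits halve the bucket count. A minimal sketch of that decode, outside the patch:

// decodeDigit mirrors the branch in processChunkG1BatchAffine above:
// even packed digits are additions into bucket d-1; odd ones are
// subtractions of magnitude bits>>1 + 1, stored in bucket bits>>1.
func decodeDigit(bits uint32) (bucketID uint32, isNeg bool) {
    if bits&1 == 0 {
        return (bits >> 1) - 1, false // digit d > 0 lives in bucket d-1
    }
    return bits >> 1, true // digit d < 0 was packed as ((-d-1)<<1)|1
}

The same flag is threaded through batchOp: pointID is i<<1, with the low bit set for a subtraction, so the batch adder can recover the sign without an extra field.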
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index b928244181..a79434b537 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bls24317 -import ( - "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 7e39930e23..feb0b0c87f 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 4056491a53..0844716e0f 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -887,7 +887,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index deeb006578..23203fd92c 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -919,7 +919,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index e519895eb5..19b6fd8496 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -106,120 +106,113 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, 
c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -238,10 +231,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -249,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -366,120 +360,113 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
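Note the shape change: partitionScalars now returns one flat []uint32 of length nbChunks*len(scalars), laid out chunk-major. That layout is what makes the plain slicing pscalars[j*n:(j+1)*n] in _innerMsmG1/_innerMsmG2 valid, and it hands each chunk goroutine a contiguous, cache-friendly block instead of strided limb reads. The indexing convention, spelled out as a sketch (inferred from the slicing; not a helper in the patch):

// digitAt returns the packed digit of point i in window j, given n
// points, under the chunk-major layout produced by partitionScalars.
func digitAt(pscalars []uint32, n, j, i int) uint32 {
    return pscalars[j*n+i]
}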
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, scalars, splitFirstChunk, processChunk, 
processLastChunk) + _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -498,10 +485,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
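The first-chunk split below exists because SNARK witnesses are often dominated by 0 and 1 values: such scalars yield a non-zero digit only in window 0, so chunk 0 carries far more bucket work than the others. The decision is the 10% heuristic computed from partitionScalars' smallValues return, mirrored here as a sketch (the threshold is the code's, the helper name is not):

// shouldSplitFirstChunk reproduces the heuristic above: smallValues
// counts scalars whose only non-zero digits sit in the lowest window.
func shouldSplitFirstChunk(smallValues, nbScalars int) bool {
    return float64(smallValues)/float64(nbScalars) >= 0.1
}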
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}
 
 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -509,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -565,7 +553,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more goroutines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all goroutines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks goroutines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window
+				// and subtract 2^c from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^c from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go
index 8b10c9786f..0028bf14dd 100644
--- a/ecc/bn254/multiexp_affine.go
+++ b/ecc/bn254/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bn254
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -283,35 +262,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
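A note on the queue next to the batch: the batch-affine adder amortizes one field inversion across up to MAX_BATCH_SIZE bucket updates (compare the add cost of 5*batchSize M + 1I quoted for the new batch add). The internals of newBatchG1Affine/newBatchG2Affine are not in this hunk, so the following is an assumption: two updates to the same bucket presumably cannot share a batch, because every destination bucket is read before the single shared inversion, hence the overflow queue for conflicting ops. A hypothetical conflict test along those lines, reusing the patch's batchOp type:

// pendingSet is a hypothetical sketch, not the patch's API: it tracks
// buckets already claimed by the in-flight batch, so that a second
// update to the same bucket is deferred to the queue instead.
type pendingSet map[uint32]struct{}

func (p pendingSet) tryAdd(op batchOp) bool {
    if _, busy := p[op.bucketID]; busy {
        return false // caller appends op to the overflow queue
    }
    p[op.bucketID] = struct{}{}
    return true
}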
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -319,13 +281,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 5434d0e1aa..2b0db816d2 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bn254 -import ( - "github.com/consensys/gnark-crypto/ecc/bn254/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -127,46 +105,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 7fbb203ce1..a9df7af3fa 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index f70d2b30cc..860ce3e355 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1019,7 +1019,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index f9284d2ec7..12579994ab 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -885,7 +885,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index ceb1ad7847..9c168de3c5 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -106,75 +106,68 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac 
{ +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -193,10 +186,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -204,12 +198,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -321,75 +315,68 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -408,10 +395,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}
 
 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -419,12 +407,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -475,7 +463,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more goroutines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all goroutines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks goroutines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window
+				// and subtract 2^c from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^c from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go
index 221079f874..1cc44fcaa6 100644
--- a/ecc/bw6-633/multiexp_affine.go
+++ b/ecc/bw6-633/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bw6633
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bw6-633/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -261,35 +240,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
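The batchOp constructed in the hunk above packs two flags into its IDs: the low bit of the partitioned scalar selects add vs. subtract, and the low bit of pointID records the subtraction for the batch processor. A minimal decode sketch (decodeOp is a hypothetical helper, not part of this patch):

    // decodeOp mirrors the op construction above: an even digit encoding d<<1
    // targets bucket d-1 as an addition; an odd encoding (m<<1)+1 targets
    // bucket m with the point negated, signalled by the low bit of pointID.
    func decodeOp(bits uint32, i int) batchOp {
    	op := batchOp{pointID: uint32(i) << 1}
    	if bits&1 == 0 {
    		op.bucketID = (bits >> 1) - 1 // add points[i]
    	} else {
    		op.bucketID = bits >> 1 // subtract points[i]
    		op.pointID |= 1         // same effect as op.pointID += 1, since the low bit was 0
    	}
    	return op
    }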
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -297,13 +259,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index 8ed7343862..996352830f 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bw6633 -import ( - "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -101,46 +79,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 0a8268cd6a..5fd9a64f32 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index 038e4f1b42..d53b7f5f82 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1019,7 +1019,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index cb9fadd15d..049841f4f7 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -879,7 +879,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index ee5ff35a9a..04a630a92a 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -106,76 +106,69 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- 
g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -194,10 +187,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -205,12 +199,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -322,76 +316,69 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
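The partitioned scalars are now laid out chunk-major: partitionScalars writes the digit of point i for chunk j at pscalars[j*n+i], with n = len(points), which is why each chunk worker above receives the contiguous slice pscalars[j*n:(j+1)*n]. A minimal sketch of the indexing, with digitOf as a hypothetical helper:

    // digitOf returns the signed-digit encoding for point i in chunk j.
    // Equivalent to pscalars[j*n : (j+1)*n][i], the slice handed to chunk j.
    func digitOf(pscalars []uint32, n, j, i int) uint32 {
    	return pscalars[j*n+i]
    }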
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -410,10 +397,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}
 
 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -421,12 +409,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -477,7 +465,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar, over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go
index 537221cb69..2a439173b9 100644
--- a/ecc/bw6-756/multiexp_affine.go
+++ b/ecc/bw6-756/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bw6756
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bw6-756/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -261,35 +240,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
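Aside on the queue allocated above: one batch of affine additions shares a single batch inversion, so in the batch-affine scheme this patch ports, a batch cannot safely contain two additions into the same bucket. A heavily simplified sketch of that scheduling idea, assuming hypothetical busy/addToBatch internals that these hunks elide:

    // scheduleOp appends op to the current batch unless its bucket was already
    // touched in this batch, in which case the op is parked in the queue and
    // replayed after the batch's shared inversion has been executed.
    func scheduleOp(busy map[uint32]bool, queue []batchOp, op batchOp, addToBatch func(batchOp)) []batchOp {
    	if busy[op.bucketID] {
    		return append(queue, op) // conflict: defer to a later batch
    	}
    	busy[op.bucketID] = true
    	addToBatch(op)
    	return queue
    }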
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -297,13 +259,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 984264456b..1f7ec4b3f8 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bw6756 -import ( - "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -101,46 +79,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 0d0384701c..c8ddd47a9f 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 765d29433b..8694980eda 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1030,7 +1030,7 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index fdb98731d4..3198411f9e 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -893,7 +893,7 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index d9da35c23d..b0fb81e0e6 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -106,76 +106,69 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. 
+ // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. - _p := make([]G1Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG1(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G1Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- 
g1JacExtended, c uint64, points []G1Affine, scalars []fr.Element)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -194,10 +187,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -205,12 +199,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, sp // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, scalars) + go processChunk(0, chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan g1JacExtended, 2) - split := len(points) / 2 - go processChunk(0, chSplit, c, points[:split], scalars[:split]) - go processChunk(0, chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0, chSplit, c, points[:split], pscalars[:split]) + go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -322,76 +316,69 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return C } - var C uint64 - nbSplits := 1 - nbChunks := 0 - for nbChunks < config.NbTasks { - C = bestC(nbPoints) - nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { - nbChunks++ - } - nbChunks *= nbSplits - if nbChunks < config.NbTasks { - nbSplits <<= 1 - nbPoints >>= 1 - } + // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. + // nbSplits := 1 + C := bestC(nbPoints) + nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar + if (fr.Limbs*64)%C != 0 { + nbChunks++ } // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - var smallValues int - scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + // var smallValues int + pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - + innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
- _p := make([]G2Jac, nbSplits-1) - chDone := make(chan int, nbSplits-1) - for i := 0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } - innerMsmG2(p, int(C), points[(nbSplits-1)*nbPoints:], scalars[(nbSplits-1)*nbPoints:], splitFirstChunk) - for i := 0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]G2Jac, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, scalars []fr.Element)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -410,10 +397,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp } // the last chunk may be processed with a different method than the rest, as it could be smaller. 
-	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars)
+	n := len(points)
+	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
-		go processChunk(uint64(j), chChunks[j], c, points, scalars)
+		go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n])
 	}
 
 	// the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1]
@@ -421,12 +409,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, sp
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
 		if !splitFirstChunk {
-			go processChunk(0, chChunks[0], c, points, scalars)
+			go processChunk(0, chChunks[0], c, points, pscalars[:n])
 		} else {
 			chSplit := make(chan g2JacExtended, 2)
-			split := len(points) / 2
-			go processChunk(0, chSplit, c, points[:split], scalars[:split])
-			go processChunk(0, chSplit, c, points[split:], scalars[split:])
+			split := n / 2
+			go processChunk(0, chSplit, c, points[:split], pscalars[:split])
+			go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
 			go func() {
 				s1 := <-chSplit
 				s2 := <-chSplit
@@ -477,7 +465,132 @@ type selector struct {
 // scalarsMont indicates wheter the provided scalars are in montgomery form
 // returns smallValues, which represent the number of scalars which meets the following condition
 // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
-func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
+func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) {
+	// number of c-bit radixes in a scalar
+	nbChunks := fr.Limbs * 64 / c
+	if (fr.Limbs*64)%c != 0 {
+		nbChunks++
+	}
+
+	toReturn := make([]uint32, len(scalars)*int(nbChunks))
+
+	mask := uint64((1 << c) - 1) // low c bits are 1
+	// msbWindow := uint64(1 << (c -1)) // msb of the c-bit window
+	max := int(1 << (c - 1))    // max value we want for our digits
+	cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words
+
+	// compute offset and word selector / shift to select the right bits of our windows
+	selectors := make([]selector, nbChunks)
+	for chunk := uint64(0); chunk < nbChunks; chunk++ {
+		jc := uint64(chunk * c)
+		d := selector{}
+		d.index = jc / 64
+		d.shift = jc - (d.index * 64)
+		d.mask = mask << d.shift
+		d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs-1)
+		if d.multiWordSelect {
+			nbBitsHigh := d.shift - uint64(64-c)
+			d.maskHigh = (1 << nbBitsHigh) - 1
+			d.shiftHigh = (c - nbBitsHigh)
+		}
+		selectors[chunk] = d
+	}
+
+	// for each chunk, we could track the number of non-zero points we will need to process
+	// this way, if a chunk has more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~the same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+				// 2^{c} from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				// 	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar, over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
+// 2^{c} from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in montgomery form
+// returns smallValues, which represents the number of scalars which meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go
index f012110b80..6fec5d8f63 100644
--- a/ecc/bw6-761/multiexp_affine.go
+++ b/ecc/bw6-761/multiexp_affine.go
@@ -16,10 +16,6 @@
 
 package bw6761
 
-import (
-	"github.com/consensys/gnark-crypto/ecc/bw6-761/fr"
-)
-
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -40,35 +36,18 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 	chRes chan<- g1JacExtended,
 	c uint64,
 	points []G1Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG1Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
 	nbBatches := 0
-	for i := 0; i < len(scalars); i++ {
-		bits := (scalars[i][s.index] & s.mask) >> s.shift
-		if s.multiWordSelect {
-			bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh
-		}
+	for i := 0; i < len(pscalars); i++ {
+		bits := pscalars[i]
 
 		if bits == 0 {
 			continue
@@ -76,13 +55,13 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64,
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to substract
-		if bits&msbWindow == 0 {
+		if bits&1 == 0 {
 			// add
-			op.bucketID = uint32(bits - 1)
+			op.bucketID = uint32((bits >> 1) - 1)
 			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
 		} else {
 			// sub
-			op.bucketID = (uint32(bits & ^msbWindow))
+			op.bucketID = (uint32((bits >> 1)))
 			op.pointID += 1
 			// op.isNeg = true
 			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
@@ -261,35 +240,18 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64,
 	chRes chan<- g2JacExtended,
 	c uint64,
 	points []G2Affine,
-	scalars []fr.Element) {
+	pscalars []uint32) {
 
-	mask := uint64((1 << c) - 1) // low c bits are 1
-	msbWindow := uint64(1 << (c - 1))
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
 
-	jc := uint64(chunk * c)
-	s := selector{}
-	s.index = jc / 64
-	s.shift = jc - (s.index * 64)
-	s.mask = mask << s.shift
-	s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1)
-	if s.multiWordSelect {
-		nbBitsHigh := s.shift - uint64(64-c)
-		s.maskHigh = (1 << nbBitsHigh) - 1
-		s.shiftHigh = (c - nbBitsHigh)
-	}
-
 	batch := newBatchG2Affine(&buckets, points)
 	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -297,13 +259,13 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits >> 1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits >> 1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 6e2acf4b41..48249ca28f 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -16,54 +16,32 @@ package bw6761 -import ( - "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" -) - func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } @@ -101,46 +79,28 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - scalars []fr.Element) { - - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) + pscalars []uint32) { var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64%c) != 0 && s.shift > (64-c) && s.index < (fr.Limbs-1) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index fa82870fa4..cbcc319e1a 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -92,7 +92,7 @@ func TestMultiExpG1(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -425,7 +425,7 @@ func TestMultiExpG2(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePoints[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 6b5b935d81..bd4a489361 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -40,7 +40,136 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { + // number of c-bit radixes in a scalar + nbChunks := fr.Limbs * 64 / c + if (fr.Limbs * 64)%c != 0 { + nbChunks++ + } + + toReturn := make([]uint32, len(scalars)*int(nbChunks)) + + mask := uint64((1 << c) - 1) // low c bits are 1 + // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window + max := int(1 << (c -1)) // max value we want for our digits + cDivides64 := (64 %c ) == 0 // if c doesn't divide 64, we may need to select over multiple words + + + // compute offset and word selector / shift to select the right bits of our windows + selectors := make([]selector, nbChunks) + for chunk:=uint64(0); chunk < nbChunks; chunk++ { + jc := uint64(chunk * c) + d := selector{} + d.index = jc / 64 + d.shift = jc - (d.index * 64) + d.mask = mask << d.shift + d.multiWordSelect = !cDivides64 && d.shift > (64-c) && d.index < (fr.Limbs - 1 ) + if d.multiWordSelect { + nbBitsHigh := d.shift - uint64(64-c) + d.maskHigh = (1 << nbBitsHigh) - 1 + d.shiftHigh = (c - nbBitsHigh) + } + selectors[chunk] = d + } + + // for each chunk, we could track the number of non-zeros points we will need to process + // this way, if a chunk has 
more work to do than others, we can spawn off more go routines
+	// (at the cost of more buckets allocated)
+	// a simplified approach is to track the small values where only the first word is set
+	// if this number represents a significant number of points, then we will split the first chunk
+	// processing in the msm in 2, to ensure all go routines finish at ~same time
+	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routines
+	// if it does, though, this will deadlock.
+	chSmallValues := make(chan int, nbTasks)
+
+	parallel.Execute(len(scalars), func(start, end int) {
+		smallValues := 0
+		for i := start; i < end; i++ {
+			var carry int
+
+			scalar := scalars[i]
+			if scalarsMont {
+				scalar.FromMont()
+			}
+			if scalar.FitsOnOneWord() {
+				// everything is 0, no need to process this scalar
+				if scalar[0] == 0 {
+					continue
+				}
+				// low c-bits are 1 in mask
+				if scalar[0]&mask == scalar[0] {
+					smallValues++
+				}
+			}
+
+			// for each chunk in the scalar, compute the current digit, and an eventual carry
+			for chunk := uint64(0); chunk < nbChunks; chunk++ {
+				s := selectors[chunk]
+
+				// init with carry if any
+				digit := carry
+				carry = 0
+
+				// digit = value of the c-bit window
+				digit += int((scalar[s.index] & s.mask) >> s.shift)
+
+				if s.multiWordSelect {
+					// we are selecting bits over 2 words
+					digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
+				}
+
+				// if digit is zero, no impact on result
+				if digit == 0 {
+					continue
+				}
+
+				// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window
+				// and subtract 2^c from the current digit, making it negative.
+				if digit >= max {
+					digit -= (1 << c)
+					carry = 1
+				}
+
+				var bits uint32
+				if digit >= 0 {
+					bits = uint32(digit) << 1
+				} else {
+					bits = (uint32(-digit-1) << 1) + 1
+				}
+				toReturn[int(chunk)*len(scalars)+i] = bits
+				// [s.index] |= (bits << s.shift)
+				// if s.multiWordSelect {
+				//	toReturn[i][s.index+1] |= (bits >> s.shiftHigh)
+				// }
+
+			}
+		}
+
+		chSmallValues <- smallValues
+
+	}, nbTasks)
+
+	// aggregate small values
+	close(chSmallValues)
+	smallValues := 0
+	for o := range chSmallValues {
+		smallValues += o
+	}
+	return toReturn, smallValues
+}
+
+// partitionScalarsOld computes, for each scalar over c-bit wide windows, nbChunks digits
+// if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
+// 2^c from the current digit, making it negative.
+// negative digits can be processed in a later step as adding -G into the bucket instead of G
+// (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication)
+// scalarsMont indicates whether the provided scalars are in Montgomery form
+// returns smallValues, which represents the number of scalars that meet the following condition
+// 0 < scalar < 2^c (in other words, scalars where only the c least significant bits are non-zero)
+func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]fr.Element, int) {
 	toReturn := make([]fr.Element, len(scalars))
 
 	// number of c-bit radixes in a scalar
@@ -160,9 +289,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	return toReturn, smallValues
 }
 
-
-
-
 {{define "multiexp" }}
@@ -249,55 +375,49 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 		return C
 	}
 
-	var C uint64
-	nbSplits := 1
-	nbChunks := 0
-	for nbChunks < config.NbTasks {
-		C = bestC(nbPoints)
-		nbChunks = int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
-		if (fr.Limbs * 64) % C != 0 {
-			nbChunks ++
-		}
-		nbChunks *= nbSplits
-		if nbChunks < config.NbTasks {
-			nbSplits <<= 1
-			nbPoints >>= 1
-		}
+	// TODO @gbotrel restore split by calling outerMsm BEFORE partitioning scalars.
+	// nbSplits := 1
+	C := bestC(nbPoints)
+	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+	if (fr.Limbs * 64) % C != 0 {
+		nbChunks ++
 	}
 
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
-	var smallValues int
-	scalars, smallValues = partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	// var smallValues int
+	pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
-
+	innerMsm{{ $.UPointName }}(p, int(C), points, pscalars, splitFirstChunk)
 	// we have nbSplits intermediate results that we must sum together.
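To see the borrow in the digit decomposition at work, take c = 4, so max = 2^{c-1} = 8: a window value of 13 becomes the digit 13 - 16 = -3 with a carry of 1 into the next window. A self-contained sketch of the recoding on a plain uint64 (toy code; partitionScalars itself works limb by limb on fr.Element):

// recodeToy splits a small scalar into signed c-bit digits, mirroring the
// borrow/carry logic of partitionScalars.
func recodeToy(scalar uint64, c uint) []int {
	max := 1 << (c - 1)
	mask := uint64(1)<<c - 1
	var digits []int
	carry := 0
	for scalar != 0 || carry != 0 {
		digit := int(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if digit >= max {
			// borrow 2^c from the next window, making this digit negative
			digit -= 1 << c
			carry = 1
		}
		digits = append(digits, digit)
	}
	return digits
}

Reconstructing checks the invariant: for scalar 13 and c = 4, the digits are [-3, 1] and -3 + 1*16 = 13.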
- _p := make([]{{ $.TJacobian }}, nbSplits - 1) - chDone := make(chan int, nbSplits - 1) - for i:=0; i < nbSplits-1; i++ { - start := i * nbPoints - end := start + nbPoints - go func(start, end, i int) { - innerMsm{{ $.UPointName }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - chDone <- i - }(start, end, i) - } + - innerMsm{{ $.UPointName }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - for i:=0; i < nbSplits-1; i++ { - done := <-chDone - p.AddAssign(&_p[done]) - } - close(chDone) + // _p := make([]{{ $.TJacobian }}, nbSplits - 1) + // chDone := make(chan int, nbSplits - 1) + // for i:=0; i < nbSplits-1; i++ { + // start := i * nbPoints + // end := start + nbPoints + // go func(start, end, i int) { + // innerMsm{{ $.UPointName }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) + // chDone <- i + // }(start, end, i) + // } + + // innerMsm{{ $.UPointName }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) + // for i:=0; i < nbSplits-1; i++ { + // done := <-chDone + // p.AddAssign(&_p[done]) + // } + // close(chDone) return p, nil } -func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool) { +func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, pscalars []uint32, splitFirstChunk bool) { {{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}} {{- /* also need to determine until which window size the ext-jacobian version is worth it. */}} switch c { @@ -310,14 +430,14 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}] {{- end}} {{- if eq $c $lc}} - _innerMsm{{ $.UPointName }}(p, {{$c}}, points, scalars, splitFirstChunk, processChunk, processChunk) + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, pscalars, splitFirstChunk, processChunk, processChunk) {{- else}} {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}] {{- end}} - _innerMsm{{ $.UPointName }}(p, {{$c}}, points, scalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, pscalars, splitFirstChunk, processChunk, processLastChunk) {{- end}} {{- end}} default: @@ -325,8 +445,8 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element)) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, pscalars []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, pscalars []uint32)) *{{ $.TJacobian }} { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -345,10 +465,11 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T } // 
the last chunk may be processed with a different method than the rest, as it could be smaller. - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, scalars) + n := len(points) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j >0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, scalars) + go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -356,12 +477,12 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0,chChunks[0], c, points, scalars) + go processChunk(0,chChunks[0], c, points, pscalars[:n]) } else { chSplit := make(chan {{ $.TJacobianExtended }}, 2) - split := len(points) / 2 - go processChunk(0,chSplit, c, points[:split], scalars[:split]) - go processChunk(0,chSplit, c, points[split:], scalars[split:]) + split := n / 2 + go processChunk(0,chSplit, c, points[:split], pscalars[:split]) + go processChunk(0,chSplit, c, points[split:], pscalars[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 0c0fba41e5..518c7e3406 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -6,9 +6,6 @@ {{ $G2TJacobian := print (toUpper .G2.PointName) "Jac" }} {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} -import ( - "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" -) const MAX_BATCH_SIZE = 600 @@ -40,35 +37,18 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c - 1)) var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - batch := newBatch{{ $.TAffine }}(&buckets, points) queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
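Note the layout this dispatch relies on: partitionScalars writes the digit of scalar i for chunk k at index k*len(scalars)+i, so each chunk goroutine receives a contiguous, independent window of the digits slice. In isolation (illustrative helper name):

// digitsForChunk returns the n digits consumed by one chunk worker, assuming
// the chunk-major layout digits[chunk*n+i] produced by partitionScalars.
func digitsForChunk(digits []uint32, chunk, n int) []uint32 {
	return digits[chunk*n : (chunk+1)*n]
}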
nbBatches := 0 - for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue @@ -76,13 +56,13 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&msbWindow == 0 { + if bits&1 == 0 { // add - op.bucketID = uint32(bits - 1) + op.bucketID = uint32((bits>>1) - 1) // buckets[bits-1].Add(&points[i], &buckets[bits-1]) } else { // sub - op.bucketID = (uint32(bits & ^msbWindow)) + op.bucketID = (uint32((bits>>1))) op.pointID += 1 // op.isNeg = true // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index d4e00fa442..8fb94f9f5b 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -6,9 +6,6 @@ {{ $G2TJacobian := print (toUpper .G2.PointName) "Jac" }} {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} -import ( - "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" -) {{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} @@ -22,48 +19,30 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - scalars []fr.Element) { + pscalars []uint32) { - mask := uint64((1 << c) - 1) // low c bits are 1 - msbWindow := uint64(1 << (c -1)) var buckets B for i := 0 ; i < len(buckets); i++ { buckets[i].setInfinity() } - jc := uint64(chunk * c) - s := selector{} - s.index = jc / 64 - s.shift = jc - (s.index * 64) - s.mask = mask << s.shift - s.multiWordSelect = (64 %c)!=0 && s.shift > (64-c) && s.index < (fr.Limbs - 1 ) - if s.multiWordSelect { - nbBitsHigh := s.shift - uint64(64-c) - s.maskHigh = (1 << nbBitsHigh) - 1 - s.shiftHigh = (c - nbBitsHigh) - } - - // for each scalars, get the digit corresponding to the chunk we're processing. 
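The split between the two chunk processors is a cost trade-off: with few buckets (small c) each bucket is hit many times and the extended-Jacobian addMixed path is fine, while for large c the batch-affine path amortizes a single field inversion over a whole batch of affine additions. A rough, illustrative cost model (the constants are ballpark figures for mixed Jacobian versus chord affine additions, not measurements from this patch):

// affineWinsToy compares ~11 field multiplications per mixed Jacobian addition
// against ~5 multiplications per batched affine addition plus an amortized
// share of one inversion (invCostInMuls multiplications per batch).
func affineWinsToy(batchSize, invCostInMuls int) bool {
	jacobian := 11 * batchSize
	affine := 5*batchSize + invCostInMuls
	return affine < jacobian
}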
- for i := 0; i < len(scalars); i++ { - bits := (scalars[i][s.index] & s.mask) >> s.shift - if s.multiWordSelect { - bits += (scalars[i][s.index+1] & s.maskHigh) << s.shiftHigh - } + for i := 0; i < len(pscalars); i++ { + bits := pscalars[i] if bits == 0 { continue } // if msbWindow bit is set, we need to substract - if bits & msbWindow == 0 { + if bits & 1 == 0 { // add - buckets[bits-1].addMixed(&points[i]) + buckets[(bits>>1)-1].addMixed(&points[i]) } else { // sub - buckets[bits & ^msbWindow].subMixed(&points[i]) + buckets[(bits>>1)].subMixed(&points[i]) } } diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index bbc5ec8980..9fc9cc1651 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1480,7 +1480,7 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca baseTable[i].AddMixed(base) } - pScalars, _ := partitionScalars(scalars, c, false, runtime.NumCPU()) + pScalars, _ := partitionScalarsOld(scalars, c, false, runtime.NumCPU()) // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index ef51368fbc..5fa8d37944 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -91,7 +91,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePoints[:], scalars16, true) + innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePointsLarge[:], scalars16, true) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) From c8613e89b60910631bb906406594751e099eae3a Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 8 Nov 2022 16:59:58 -0600 Subject: [PATCH 07/43] feat: gymnastic to ensure buckets are on the stack -- compiler hints --- ecc/bls12-377/multiexp.go | 100 ++-- ecc/bls12-377/multiexp_affine.go | 456 ++++++++---------- ecc/bls12-377/multiexp_jacobian.go | 28 +- ecc/bls12-377/multiexp_test.go | 4 +- ecc/bls12-378/multiexp.go | 100 ++-- ecc/bls12-378/multiexp_affine.go | 456 ++++++++---------- ecc/bls12-378/multiexp_jacobian.go | 28 +- ecc/bls12-378/multiexp_test.go | 4 +- ecc/bls12-381/multiexp.go | 100 ++-- ecc/bls12-381/multiexp_affine.go | 456 ++++++++---------- ecc/bls12-381/multiexp_jacobian.go | 28 +- ecc/bls12-381/multiexp_test.go | 4 +- ecc/bls24-315/multiexp.go | 100 ++-- ecc/bls24-315/multiexp_affine.go | 456 ++++++++---------- ecc/bls24-315/multiexp_jacobian.go | 28 +- ecc/bls24-315/multiexp_test.go | 4 +- ecc/bls24-317/multiexp.go | 100 ++-- ecc/bls24-317/multiexp_affine.go | 456 ++++++++---------- ecc/bls24-317/multiexp_jacobian.go | 28 +- ecc/bls24-317/multiexp_test.go | 4 +- ecc/bn254/multiexp.go | 100 ++-- ecc/bn254/multiexp_affine.go | 456 ++++++++---------- ecc/bn254/multiexp_jacobian.go | 28 +- ecc/bn254/multiexp_test.go | 4 +- ecc/bw6-633/multiexp.go | 56 +-- ecc/bw6-633/multiexp_affine.go | 456 ++++++++---------- ecc/bw6-633/multiexp_jacobian.go | 28 +- ecc/bw6-633/multiexp_test.go | 4 +- ecc/bw6-756/multiexp.go | 56 +-- ecc/bw6-756/multiexp_affine.go | 456 ++++++++---------- 
ecc/bw6-756/multiexp_jacobian.go | 28 +- ecc/bw6-756/multiexp_test.go | 4 +- ecc/bw6-761/multiexp.go | 56 +-- ecc/bw6-761/multiexp_affine.go | 456 ++++++++---------- ecc/bw6-761/multiexp_jacobian.go | 28 +- ecc/bw6-761/multiexp_test.go | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 24 +- .../ecc/template/multiexp_affine.go.tmpl | 241 +++++---- .../ecc/template/multiexp_jacobian.go.tmpl | 14 +- .../ecc/template/tests/multiexp.go.tmpl | 2 +- 40 files changed, 2532 insertions(+), 2909 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 4fad52e512..89db336b35 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, 
pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars 
[]uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
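When splitFirstChunk is set, chunk 0 is processed by two goroutines over half of the points each, and a small fan-in sums the partial results so the downstream reduction still sees one value per chunk. Stripped of the surrounding code, the pattern is (a sketch written as if inside the package, since g1JacExtended and its add method are unexported):

// mergeSplit forwards the sum of the two half-chunk results to the channel the
// reduction reads, making the split invisible to the rest of the pipeline.
func mergeSplit(chSplit chan g1JacExtended, chChunk chan<- g1JacExtended) {
	s1 := <-chSplit
	s2 := <-chSplit
	close(chSplit)
	s1.add(&s2)
	chChunk <- s1
}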
// _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c3f89f7406..c2e56a6936 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
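The special cases in the add closure above exist because the batched chord formula lambda = (y2-y1)/(x2-x1) is undefined whenever the two x-coordinates coincide: P + P needs the tangent (a doubling) and P + (-P) is the point at infinity, so both are resolved directly on the bucket before anything is queued for the batch. As a sketch of the guard in isolation (an illustrative helper, relying on fp.Element's Equal):

// chordSafe reports whether (p, q) can be fed to the batched affine addition:
// equal x-coordinates mean doubling or cancellation, which the chord formula
// cannot express and which the add closure above handles explicitly.
func chordSafe(p, q *G1Affine) bool {
	return !p.X.Equal(&q.X)
}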
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index 1981509ee0..2c95e7f536 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
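This is where the commit's "gymnastic to ensure buckets are on the stack" pays off: every bucketG1AffineCxx / bucketg1JacExtendedCxx instantiation in the type unions above is a fixed-size array type, so `var buckets B` has a size known at compile time and escape analysis can keep it in the goroutine's frame instead of the heap. A toy model of the idea (toy names; the escape report can be inspected with go build -gcflags=-m):

// Toy fixed-size bucket array: the length 1<<(c-1) is part of the type.
type bucketsC4 [1 << 3]uint64

// processToy mimics the generic chunk processors: B's array length is a
// compile-time constant, so buckets need not escape to the heap.
func processToy[B ~[1 << 3]uint64](out chan<- uint64) {
	var buckets B
	for i := range buckets {
		buckets[i] = uint64(i)
	}
	var total uint64
	for i := range buckets {
		total += buckets[i]
	}
	out <- total
}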
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index fe84ecc91f..afc5108951 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 200a5fc096..5914a5a0d6 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 06c5f74bfe..64ca8320b2 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
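
BatchAddG1Affine performs one shared inversion for the whole batch, so each bucket may be written at most once per batch; that is the invariant the canAdd, bucketIds and queue machinery above enforces. A toy sketch of that scheduling discipline, with plain integers as bucket IDs and batchSize shrunk to 2 for the demo (illustrative code, not the library's API):

package main

import "fmt"

type batchOp struct{ bucketID uint32 }

func main() {
	const batchSize = 2 // tiny on purpose; the real code derives it from len(buckets)
	busy := make(map[uint32]struct{})
	batch := make([]batchOp, 0, batchSize)
	var queue []batchOp

	flush := func() {
		if len(batch) == 0 {
			return
		}
		fmt.Println("batch add of", len(batch), "ops (one shared inversion)")
		batch = batch[:0]
		for k := range busy {
			delete(busy, k)
		}
	}

	for _, op := range []batchOp{{1}, {1}, {2}, {3}, {3}} {
		if _, taken := busy[op.bucketID]; taken {
			queue = append(queue, op) // bucket already written in this batch: defer
			continue
		}
		busy[op.bucketID] = struct{}{}
		batch = append(batch, op)
		if len(batch) == batchSize {
			flush()
		}
	}
	fmt.Println(len(queue), "ops deferred to later batches")
	flush() // flush a final, possibly partial batch
}

Deferred ops are retried once the conflicting bucket's batch has executed, which is what the processQueue loop below does.
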
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 592070f11d..3ce29436eb 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index a94fc2e0b9..a77f7097e1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 50b716e3c4..7f730ca946 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
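
A sketch of the first-chunk heuristic computed just above: scalars that fit in the low-order window only populate chunk 0, so when at least 10% of them are small, chunk 0 is split across two goroutines. The 0.1 threshold is the ratio from the hunk; the helper name is made up for illustration:

package main

import "fmt"

func shouldSplitFirstChunk(smallValues, nbScalars int) bool {
	return float64(smallValues)/float64(nbScalars) >= 0.1
}

func main() {
	fmt.Println(shouldSplitFirstChunk(5, 100))  // false: chunk 0 is balanced enough
	fmt.Println(shouldSplitFirstChunk(30, 100)) // true: chunk 0 would be the bottleneck
}
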
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 3f0eeabba3..c965e24de2 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
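
The dispatch at the top of this hunk hands chunk j the slice digits[j*n:(j+1)*n], which assumes partitionScalars lays digits out chunk-major: the digit of point i for chunk j sits at digits[j*n+i]. A small sketch of that layout (illustrative names, integer tags in place of real digits):

package main

import "fmt"

// chunkDigits returns the digits of chunk j, one entry per point.
func chunkDigits(digits []uint32, j, n int) []uint32 {
	return digits[j*n : (j+1)*n]
}

func main() {
	n, nbChunks := 4, 3
	digits := make([]uint32, nbChunks*n)
	for j := 0; j < nbChunks; j++ {
		for i := 0; i < n; i++ {
			digits[j*n+i] = uint32(10*j + i) // tag slots so the layout is visible
		}
	}
	fmt.Println(chunkDigits(digits, 1, n)) // [10 11 12 13]
}
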
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 3840228907..7c69354658 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 8e356fde9d..457546524f 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 912d69ba44..37b43c6fe8 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
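
For the buckets-to-total reduction named in the multiexp_affine comments above (total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1]), the usual trick is a running sum scanned from the highest bucket down, costing roughly 2n additions instead of one scalar multiplication per bucket. A sketch with integers standing in for curve points:

package main

import "fmt"

func reduceBuckets(buckets []int) int {
	runningSum, total := 0, 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k] // = b[k] + b[k+1] + ... + b[n-1]
		total += runningSum      // b[k] ends up counted k+1 times in total
	}
	return total
}

func main() {
	b := []int{3, 1, 4}
	fmt.Println(reduceBuckets(b))         // 17
	fmt.Println(1*b[0] + 2*b[1] + 3*b[2]) // 17
}
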
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
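Editor's note: the slicing below only works because partitionScalars appears to lay the digits out chunk-major, i.e. the n digits of window j, one per point, are contiguous. A small sketch of that indexing, stated as an assumption inferred from the slices rather than documented API:

// chunk-major layout implied by the slices below:
// the digit of point i in window j lives at digits[j*n+i].
n := len(points)
digitsOfChunk := func(j int) []uint32 {
	return digits[j*n : (j+1)*n] // one uint32 per point
}
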
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 93b8a89cdb..10c47b3306 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
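Editor's note: one invariant in the closures above deserves a comment. R holds pointers into buckets, and BatchAddG1Affine rewrites every R[k] in place, so a given bucket may appear at most once per batch; a second op on the same bucket within one batch would derive its slope from a coordinate the batch is about to overwrite. That is exactly what the bucketIds set enforces through canAdd, and why a conflicting op is parked in queue. The guard pattern both the main loop and the processQueue closure below follow (names as in the surrounding code):

if canAdd(op.bucketID) {
	add(op) // bucket not yet referenced in the current batch
	if isFull() {
		executeAndReset() // one shared inversion amortized over the whole batch
	}
} else {
	queue = append(queue, op) // same bucket already pending: retry after reset
}

Note also that processQueue compacts with a swap-remove (queue[i] = queue[len(queue)-1]) and then decrements i, so the op swapped into slot i is examined rather than skipped.
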
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index be1b9a8b65..6663cc9e73 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 352e8122b6..1b697194a5 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index f2c3d767a1..5cb36f1788 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
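Editor's note: the 0.1 threshold guards against a skew that is easy to quantify. A scalar smaller than 2^c (for instance the 0/1 wire values of a SNARK witness) yields a non-zero digit in window 0 only. As a worked example with hypothetical numbers: if 90% of one million scalars are boolean, every chunk except the first sees at most the ~100k digits coming from the large scalars, while chunk 0 additionally absorbs every non-zero boolean scalar and can carry several times the work of any sibling; splitting it across two goroutines, as the code below does, roughly halves that straggler's latency.
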
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
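Editor's note: two patterns are visible in the dispatch above. First, batch-affine buckets only start at c = 10: a window of c bits uses on the order of 2^(c-1) buckets, and since batchSize is len(buckets)/5, smaller windows would make same-bucket collisions, and hence queueing, dominate. Second, each window size is paired with a processLastChunk sized to the leftover high bits. A sketch of that arithmetic, assuming fr.Limbs = 4 (256 scalar bits, which holds for the curves in these hunks):

lastWindowBits := func(c int) int {
	const scalarBits = 4 * 64 // fr.Limbs * 64
	if scalarBits%c == 0 {
		return c // c = 4, 8, 16: the last chunk reuses processChunk itself
	}
	// c = 5 -> 1 (C1), c = 10 -> 6 (C6), c = 13 -> 9 (C9), c = 20 -> 16 (C16)
	return scalarBits % c
}
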
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 2f19119cc6..2833b83137 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
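Editor's note: why the add closure above screens for equal x-coordinates before queueing. The batched affine formula divides by the x-difference, and the shared inversion has no per-element fallback. Equal points are safe to queue, since the batch add helper detects them and switches to the tangent slope; the one fatal case is a bucket meeting its own negation, where the chord is vertical. Annotated, that guard reads:

if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) {
	// PP == -BK: the denominator PP.X - BK.X handed to the batch
	// inversion would be zero, so resolve the sum to infinity eagerly.
	BK.setInfinity()
	return
}
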
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index a79434b537..fccf3e949d 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
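
The chunk processors above and below all consume the same `digits` encoding produced by partitionScalars: each uint32 packs one signed window value, the low bit flags a subtraction, and the remaining bits carry the bucket index (shifted by one on the add path, since bucket 0 holds coefficient 1). A minimal decoding sketch — decodeDigit is illustrative and not a function in this patch:

package main

import "fmt"

// decodeDigit unpacks the signed-digit convention used by the chunk
// processors: digit == 0 means skip; an even digit adds the point to
// bucket (digit>>1)-1; an odd digit subtracts it from bucket digit>>1.
func decodeDigit(digit uint32) (bucketID uint32, sub, skip bool) {
	if digit == 0 {
		return 0, false, true
	}
	if digit&1 == 0 {
		return (digit >> 1) - 1, false, false
	}
	return digit >> 1, true, false
}

func main() {
	for _, d := range []uint32{0, 2, 3, 6, 7} {
		bucketID, sub, skip := decodeDigit(d)
		fmt.Printf("digit=%d -> bucket=%d sub=%v skip=%v\n", d, bucketID, sub, skip)
	}
}

The signed encoding is what halves the number of buckets a window needs: a c-bit window gets by with 2^(c-1) buckets instead of 2^c.
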
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index feb0b0c87f..eb190a317b 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 19b6fd8496..e3c84b390f 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
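
Two contracts from the MultiExp preamble above are worth spelling out: partitionScalars lays the digits out window-major, so the digit of scalar i for chunk j sits at digits[j*n+i] (this is what the slicing in _innerMsmG1 depends on), and smallValues drives the decision to process the first window in two halves. A sketch restating both contracts; the function names are illustrative, not part of the library:

package sketch

// digitsForChunk mirrors the slicing in _innerMsmG1: digits is a
// flattened nbChunks x n table, one row per c-bit window.
func digitsForChunk(digits []uint32, chunkID, n int) []uint32 {
	return digits[chunkID*n : (chunkID+1)*n]
}

// shouldSplitFirstChunk restates the 10% heuristic: small scalars (e.g.
// SNARK witness values 0 and 1) only populate the first window, so when
// they dominate, that window is split so both halves finish in roughly
// the time the other windows take.
func shouldSplitFirstChunk(smallValues, nbScalars int) bool {
	return float64(smallValues)/float64(nbScalars) >= 0.1
}
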
// _p := make([]G1Jac, nbSplits - 1) @@ -146,73 +146,73 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk := processChunkG1BatchAffine[bucketG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG1BatchAffine[bucketG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG1BatchAffine[bucketG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG1BatchAffine[bucketG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG1BatchAffine[bucketG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 14, points, 
digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG1BatchAffine[bucketG1AffineC20] processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG1BatchAffine[bucketG1AffineC21] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -232,10 +232,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -243,12 +243,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
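
The goroutine dispatch around this point follows a plain fan-out/fan-in shape: one buffered channel of capacity 1 per window, one goroutine per window, results collected by the caller. A stripped-down sketch of that shape, with partialSum standing in for g1JacExtended and the final window-weighted reduction elided (it lives outside these hunks):

package sketch

type partialSum struct{ chunkID uint64 }

func dispatch(nbChunks int, process func(chunkID uint64, out chan<- partialSum)) []partialSum {
	chChunks := make([]chan partialSum, nbChunks)
	for i := 0; i < nbChunks; i++ {
		chChunks[i] = make(chan partialSum, 1)
	}
	// highest window first, mirroring the dispatch order in _innerMsmG1
	for j := nbChunks - 1; j >= 0; j-- {
		go process(uint64(j), chChunks[j])
	}
	res := make([]partialSum, nbChunks)
	for i := range chChunks {
		res[i] = <-chChunks[i]
	}
	return res
}
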
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -372,12 +372,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -400,73 +400,73 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: processChunk 
:= processChunkG2BatchAffine[bucketG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: processChunk := processChunkG2BatchAffine[bucketG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: processChunk := processChunkG2BatchAffine[bucketG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: processChunk := processChunkG2BatchAffine[bucketG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: processChunk := processChunkG2BatchAffine[bucketG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) case 20: processChunk := processChunkG2BatchAffine[bucketG2AffineC20] processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) case 21: processChunk := processChunkG2BatchAffine[bucketG2AffineC21] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -486,10 +486,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. 
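
Each case in the switches above instantiates the generic chunk processor with a distinct bucket array type, so the number of buckets is fixed at compile time and `var buckets B` needs no heap allocation. A toy reduction of the trick; the types below are stand-ins for the library's bucketG1AffineCxx / bucketg2JacExtendedCxx arrays:

package sketch

type point struct{ x, y uint64 }

// one array type per window size c, holding 2^(c-1) buckets
type bucketsC4 [1 << 3]point
type bucketsC5 [1 << 4]point

type ibBuckets interface {
	bucketsC4 | bucketsC5
}

// processChunk gets a separate instantiation per window size;
// len(buckets) resolves per instantiation.
func processChunk[B ibBuckets]() int {
	var buckets B
	return len(buckets)
}

var nbBucketsC4 = processChunk[bucketsC4]() // 8
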
n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -497,12 +497,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 0028bf14dd..5fea03fa0e 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
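
The bucketIds set above is what keeps the batching sound: BatchAddG1Affine derives every lambda from one shared batch inversion before writing any bucket back, so if the same bucket appeared twice in a batch, both operations would read the pre-batch value and one update would be lost. canAdd gates on that, and colliding ops fall through to the queue. A toy scheduler showing just that rule (types hypothetical, not the patch's):

package sketch

type op struct{ bucketID uint32 }

type scheduler struct {
	inFlight map[uint32]struct{} // buckets already written in this batch
	batch    []op
	size     int
}

// tryAdd reports false when the bucket is already used in the current
// batch; the caller is expected to queue the op and retry after a flush.
func (s *scheduler) tryAdd(o op, flush func([]op)) bool {
	if _, busy := s.inFlight[o.bucketID]; busy {
		return false
	}
	s.inFlight[o.bucketID] = struct{}{}
	s.batch = append(s.batch, o)
	if len(s.batch) == s.size {
		flush(s.batch)
		s.batch = s.batch[:0]
		for k := range s.inFlight {
			delete(s.inFlight, k)
		}
	}
	return true
}
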
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -145,162 +234,144 @@ type ibG1Affine interface { bucketG1AffineC21 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
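
For reference, the per-operation arithmetic this batch amortizes, for a bucket R and an incoming affine point P on a short-Weierstrass curve (standard formulas, stated here for orientation rather than quoted from the patch):

\lambda_{\text{add}} = \frac{y_P - y_R}{x_P - x_R},
\qquad
\lambda_{\text{dbl}} = \frac{3 x_P^2}{2 y_P},

x_{R'} = \lambda^2 - x_R - x_P,
\qquad
y_{R'} = \lambda\,(x_R - x_{R'}) - y_R.

Only the denominator of lambda needs an inversion, and that denominator is exactly what is fed to the batch inversion: roughly one multiplication to finish lambda, one for lambda squared, one for y_{R'}, plus about three per element contributed by the batch-inversion unwind — on the order of five multiplications per point and a single shared inversion per batch.
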
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -309,14 +380,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -370,110 +441,3 @@ type ibG2Affine interface { bucketG2AffineC20 | bucketG2AffineC21 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 2b0db816d2..ef34f5faad 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
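
Every chunk processor in this patch ends with the reduction described by the recurring comment total = bucket[0] + 2*bucket[1] + 3*bucket[2] + ... + n*bucket[n-1]. That weighted sum is not computed with scalar multiplications: scanning the buckets from the highest coefficient down with two accumulators yields it in about 2n group additions, because bucket[k] is folded into the running sum once for every step at or below k. A sketch with integers standing in for group elements:

package sketch

func reduceBuckets(buckets []int) (total int) {
	runningSum := 0
	for k := len(buckets) - 1; k >= 0; k-- {
		runningSum += buckets[k] // now equals bucket[k] + ... + bucket[n-1]
		total += runningSum      // so bucket[k] ends up counted (k+1) times
	}
	return total
}
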
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -105,7 +103,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -113,20 +111,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index a9df7af3fa..05d133cdbe 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 9c168de3c5..b8fdb1314f 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
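
The window size C picked by MultiExp above is the classic bucket-method trade-off. With n points and b-bit scalars (b = fr.Limbs*64 here), the usual cost model — standard analysis, not stated in this patch — is

\text{cost}(c) \;\approx\; \left\lceil \frac{b}{c} \right\rceil \left( n + 2^{\,c-1} \right)\ \text{group additions},

minimized for c near \log_2 n: more points justify wider windows. Note that the bw6 switches below expose a smaller set of window sizes (4, 5, 8, 16) than the bn/bls ones.
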
// _p := make([]G1Jac, nbSplits - 1) @@ -146,28 +146,28 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -187,10 +187,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -198,12 +198,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
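
The splitFirstChunk branch just below involves a small merge stage: two goroutines process each half of the points into one buffered channel, and a third folds the two partial sums before forwarding to the first window's channel. Its shape in isolation, with partial and addPartial as illustrative stand-ins for the extended-Jacobian accumulator:

package sketch

type partial struct{ v int }

func addPartial(a, b partial) partial { return partial{a.v + b.v} }

func splitFirst(out chan<- partial, process func(lo, hi int, ch chan<- partial), n int) {
	chSplit := make(chan partial, 2)
	split := n / 2
	go process(0, split, chSplit)
	go process(split, n, chSplit)
	go func() {
		s1 := <-chSplit
		s2 := <-chSplit
		out <- addPartial(s1, s2)
	}()
}
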
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -327,12 +327,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -355,28 +355,28 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -396,10 +396,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, 
as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -407,12 +407,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 1cc44fcaa6..8305011734 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
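
processQueue below drains the retry queue with the swap-remove idiom: a consumed element is overwritten by the slice's last element, the slice shrinks, and i-- makes the loop reconsider the swapped-in element. The reordering is harmless here because bucket accumulation commutes. The idiom on its own, as a generic sketch:

package sketch

// drain consumes every element for which tryConsume succeeds, in O(1)
// per removal, without preserving order.
func drain[T any](queue []T, tryConsume func(T) bool) []T {
	for i := 0; i < len(queue); i++ {
		if tryConsume(queue[i]) {
			queue[i] = queue[len(queue)-1]
			queue = queue[:len(queue)-1]
			i--
		}
	}
	return queue
}
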
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -123,162 +212,144 @@ type ibG1Affine interface { bucketG1AffineC16 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
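
The shared inversion this batching amortizes is Montgomery's batch-inversion trick: a forward pass of prefix products, a single field inversion, then a backward unwind, turning n inversions into one inversion plus about 3(n-1) multiplications. A self-contained sketch over math/big with a toy prime; the library's versions operate directly on the curve's field elements and, like this sketch, ignore zero inputs:

package main

import (
	"fmt"
	"math/big"
)

func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // a[0] * ... * a[i-1]
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the one inversion shared by the whole batch
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // now equals a[i]^-1 mod p
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	inv := batchInvert([]*big.Int{big.NewInt(3), big.NewInt(7)}, p)
	fmt.Println(inv) // [34 29]: 3*34 ≡ 1 and 7*29 ≡ 1 (mod 101)
}
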
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -287,14 +358,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -326,110 +397,3 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index 996352830f..29756cc499 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
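
For the small windows that stay on the Jacobian path, addMixed/subMixed below accumulate an affine point into an extended-Jacobian bucket with no inversion at all. Assuming the usual extended ("XYZZ") representation for g1JacExtended/g2JacExtended — an assumption, since the type definitions sit outside these hunks — a bucket stores (X, Y, ZZ, ZZZ) with

x = X/ZZ, \qquad y = Y/ZZZ, \qquad ZZ^3 = ZZZ^2,

so mixed addition and subtraction cost only field multiplications, at the price of keeping four coordinates per bucket instead of the two that the affine buckets use for the large windows.
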
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -79,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -87,20 +85,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 5fd9a64f32..75e1904bf0 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 04a630a92a..98b3867477 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
// _p := make([]G1Jac, nbSplits - 1) @@ -146,29 +146,29 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -188,10 +188,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -199,12 +199,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
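// [editor's note] The slicing above assumes partitionScalars lays the digits
// out chunk-major: one contiguous window of n = len(points) entries per chunk,
// so chunk j reads digits[j*n:(j+1)*n]. A tiny index helper (hypothetical,
// for illustration) makes the layout explicit:

func digitIndex(chunk, pointIdx, n int) int {
	// digits[chunk*n : (chunk+1)*n] is the window handed to chunk `chunk`;
	// within it, entry pointIdx is the digit of points[pointIdx]
	return chunk*n + pointIdx
}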
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -328,12 +328,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -356,29 +356,29 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -398,10 +398,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split 
// the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -409,12 +409,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 2a439173b9..8b01509015 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
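// [editor's note] A batchOp packs the point index and the sign into pointID:
// index<<1, with the low bit set for a subtraction. The add closure above
// recovers the index with op.pointID>>1, and op.isNeg() (defined elsewhere in
// the package) is assumed to test the low bit. A round-trip sketch, with a
// hypothetical helper name:

func packOp(bucketID, pointIdx uint32, neg bool) batchOp {
	op := batchOp{bucketID: bucketID, pointID: pointIdx << 1} // low bit reserved for the sign
	if neg {
		op.pointID |= 1 // mark as subtraction; the points index is still op.pointID >> 1
	}
	return op
}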
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -123,162 +212,144 @@ type ibG1Affine interface { bucketG1AffineC16 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -287,14 +358,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -326,110 +397,3 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 1f7ec4b3f8..10a354ae58 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
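// [editor's note] Every chunk processor in this patch ends with the
// "reduce buckets into total" step sketched in the comments above:
// total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1]. That weighted sum
// costs only 2(n-1) group additions when taken as a running sum from the top
// bucket down -- shown here over int64 stand-ins for group elements:

func reduceBuckets(buckets []int64) int64 {
	var runningSum, total int64
	for i := len(buckets) - 1; i >= 0; i-- {
		runningSum += buckets[i] // runningSum = buckets[i] + ... + buckets[n-1]
		total += runningSum      // buckets[i] ends up accumulated (i+1) times
	}
	return total
}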
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -79,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -87,20 +85,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index c8ddd47a9f..c37051d70d 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index b0fb81e0e6..9a41a9176f 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -118,12 +118,12 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG1(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
// _p := make([]G1Jac, nbSplits - 1) @@ -146,29 +146,29 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG1(p *G1Jac, c int, points []G1Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, pscalars []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -188,10 +188,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -199,12 +199,12 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit @@ -328,12 +328,12 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, pscalars, splitFirstChunk) + innerMsmG2(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. // _p := make([]G2Jac, nbSplits - 1) @@ -356,29 +356,29 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func innerMsmG2(p *G2Jac, c int, points []G2Affine, pscalars []uint32, splitFirstChunk bool) { +func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 16, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, pscalars []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -398,10 +398,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split 
// the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -409,12 +409,12 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, pscalars []uint32, split // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, pscalars[:n]) + go processChunk(0, chChunks[0], c, points, digits[:n]) } else { chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], pscalars[:split]) - go processChunk(0, chSplit, c, points[split:], pscalars[split:n]) + go processChunk(0, chSplit, c, points[:split], digits[:split]) + go processChunk(0, chSplit, c, points[split:], digits[split:n]) go func() { s1 := <-chSplit s2 := <-chSplit diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 6fec5d8f63..6cdd72b7a4 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -36,45 +36,134 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { + // init the buckets var buckets B for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() } - batch := newBatchG1Affine(&buckets, points) + // setup for the batch affine; + batchSize := len(buckets) / 5 + if batchSize > MAX_BATCH_SIZE { + batchSize = MAX_BATCH_SIZE + } + if batchSize <= 0 { + batchSize = 1 + } + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch + + var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G1Affine // ... + + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok + } + + isFull := func() bool { + return cptP == batchSize + } + + executeAndReset := func() { + if cptP == 0 { + return + } + BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) + } + cptP = 0 + } + + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return + } + } + + // bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
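// [editor's note] A worked example for the batch-size heuristic above,
// assuming the signed-digit encoding uses roughly 2^(c-1) buckets per chunk:
// for c = 16 that is 32768 buckets, len(buckets)/5 = 6553, clamped down to
// MAX_BATCH_SIZE = 600; for a hypothetical c = 10, 512 buckets give a batch
// size of 102 and neither clamp fires. The same logic, extracted as a pure
// function for illustration:

func batchSizeFor(nbBuckets int) int {
	batchSize := nbBuckets / 5
	if batchSize > MAX_BATCH_SIZE {
		batchSize = MAX_BATCH_SIZE // e.g. 32768 buckets -> 600
	}
	if batchSize <= 0 {
		batchSize = 1 // degenerate bucket counts still make progress
	}
	return batchSize
}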
+ + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } + } + nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -83,14 +172,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG1Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -123,162 +212,144 @@ type ibG1Affine interface { bucketG1AffineC16 } -type BatchG1Affine[B ibG1Affine] struct { - P [MAX_BATCH_SIZE]G1Affine - R [MAX_BATCH_SIZE]*G1Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G1Affine - buckets *B -} +// processChunkG2BatchAffine process a chunk of the scalars during the msm +// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition +// we use a batch affine addition. 
+// +// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 +// See Section 5.3: ia.cr/2022/1396 +func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, + chRes chan<- g2JacExtended, + c uint64, + points []G2Affine, + digits []uint32) { + + // init the buckets + var buckets B + for i := 0; i < len(buckets); i++ { + buckets[i].setInfinity() + } -func newBatchG1Affine[B ibG1Affine](buckets *B, points []G1Affine) BatchG1Affine[B] { - batchSize := len(*buckets) / 5 + // setup for the batch affine; + batchSize := len(buckets) / 5 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - return BatchG1Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} + bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + cptP := 0 // count the number of point added to current batch -func (b *BatchG1Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} + var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack + var R [MAX_BATCH_SIZE]*G2Affine // ... -func (b *BatchG1Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return + canAdd := func(bID uint32) bool { + _, ok := bucketIds[bID] + return !ok } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG1Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG1Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} -func (b *BatchG1Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return + isFull := func() bool { + return cptP == batchSize } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() + + executeAndReset := func() { + if cptP == 0 { return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return + BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + for k := range bucketIds { + delete(bucketIds, k) } + cptP = 0 } - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} + add := func(op batchOp) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch -func processQueueG1Affine[B ibG1Affine](queue []batchOp, batch *BatchG1Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() + BK := &buckets[op.bucketID] + PP := &points[op.pointID>>1] + if PP.IsInfinity() { + return + } + // handle special cases with inf or -P / P + if BK.IsInfinity() { + if op.isNeg() { + BK.Neg(PP) + } else { + BK.Set(PP) + } + return + } + if op.isNeg() { + // if bucket == P --> -P == 0 + if BK.Equal(PP) { + BK.setInfinity() + return + } + } else { + // if bucket == -P, B == 0 + if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { + BK.setInfinity() + return } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] } - } - return queue -} + // 
bucketIds[cptP] = op.bucketID + bucketIds[op.bucketID] = struct{}{} + R[cptP] = BK + if op.isNeg() { + P[cptP].Neg(PP) + } else { + P[cptP].Set(PP) + } + cptP++ + } -// processChunkG2BatchAffine process a chunk of the scalars during the msm -// using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition -// we use a batch affine addition. -// -// this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 -// See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, - chRes chan<- g2JacExtended, - c uint64, - points []G2Affine, - pscalars []uint32) { + queue := make([]batchOp, 0, 4096) // TODO find right capacity here. - var buckets B - for i := 0; i < len(buckets); i++ { - buckets[i].setInfinity() + processQueue := func() { + // for i := len(queue) - 1; i >= 0; i-- { + for i := 0; i < len(queue); i++ { + if canAdd(queue[i].bucketID) { + add(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[len(queue)-1] + queue = queue[:len(queue)-1] + i-- + } + } } - batch := newBatchG2Affine(&buckets, points) - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. nbBatches := 0 - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] + for i, digit := range digits { - if bits == 0 { + if digit == 0 { continue } op := batchOp{pointID: uint32(i) << 1} // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - op.bucketID = uint32((bits >> 1) - 1) - // buckets[bits-1].Add(&points[i], &buckets[bits-1]) + op.bucketID = uint32((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((bits >> 1))) + op.bucketID = (uint32((digit >> 1))) op.pointID += 1 - // op.isNeg = true - // buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i]) } - if batch.CanAdd(op.bucketID) { - batch.Add(op) - if batch.IsFull() { - batch.ExecuteAndReset() + if canAdd(op.bucketID) { + add(op) + if isFull() { + executeAndReset() nbBatches++ if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - batch.Add(queue[len(queue)-1]) + add(queue[len(queue)-1]) queue = queue[:len(queue)-1] } + // processQueue() } } else { // put it in queue. @@ -287,14 +358,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) - // batch.ExecuteAndReset() + // executeAndReset() for len(queue) != 0 { - queue = processQueueG2Affine(queue, &batch) - batch.ExecuteAndReset() // execute batch even if not full. + processQueue() + executeAndReset() // execute batch even if not full. } // flush items in batch. - batch.ExecuteAndReset() + executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -326,110 +397,3 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } - -type BatchG2Affine[B ibG2Affine] struct { - P [MAX_BATCH_SIZE]G2Affine - R [MAX_BATCH_SIZE]*G2Affine - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []G2Affine - buckets *B -} - -func newBatchG2Affine[B ibG2Affine](buckets *B, points []G2Affine) BatchG2Affine[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return BatchG2Affine[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *BatchG2Affine[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *BatchG2Affine[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAddG2Affine(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *BatchG2Affine[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *BatchG2Affine[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueueG2Affine[B ibG2Affine](queue []batchOp, batch *BatchG2Affine[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 48249ca28f..045bace5e7 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -28,20 +28,18 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. 
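// [editor's note] Why canAdd and the bucketIds set? R holds *pointers* into
// buckets, and BatchAdd writes all the results back in one pass, so two ops on
// the same bucket inside one batch would both be computed against the stale,
// pre-batch value, and the second write would clobber the first. A minimal
// illustration of the hazard with plain integers (hypothetical helper):

func whyOneOpPerBucketPerBatch() int {
	bucket := 10
	r := []*int{&bucket, &bucket} // same bucket queued twice in one batch
	p := []int{1, 2}
	staged := make([]int, len(r))
	for i := range r {
		staged[i] = *r[i] + p[i] // both reads see 10: staged = [11, 12]
	}
	for i := range r {
		*r[i] = staged[i] // second write overwrites the first
	}
	return bucket // 12, not the correct 13 -- hence one op per bucket per batch
}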
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } @@ -79,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - pscalars []uint32) { + digits []uint32) { var buckets B for i := 0; i < len(buckets); i++ { @@ -87,20 +85,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, } // for each scalars, get the digit corresponding to the chunk we're processing. - for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits&1 == 0 { + if digit&1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits >> 1)].subMixed(&points[i]) + buckets[(digit >> 1)].subMixed(&points[i]) } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index cbcc319e1a..b0a94d79dd 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -275,7 +275,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { @@ -606,7 +606,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index bd4a489361..6dca2eb861 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -387,12 +387,12 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - pscalars, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsm{{ $.UPointName }}(p, int(C), points, pscalars, splitFirstChunk) + innerMsm{{ $.UPointName }}(p, int(C), points, digits, splitFirstChunk) // we have nbSplits intermediate results that we must sum together. 
@@ -417,7 +417,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } -func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, pscalars []uint32, splitFirstChunk bool) { +func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, digits []uint32, splitFirstChunk bool) { {{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}} {{- /* also need to determine until which window size the ext-jacobian version is worth it. */}} switch c { @@ -430,14 +430,14 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}] {{- end}} {{- if eq $c $lc}} - _innerMsm{{ $.UPointName }}(p, {{$c}}, points, pscalars, splitFirstChunk, processChunk, processChunk) + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk) {{- else}} {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}] {{- end}} - _innerMsm{{ $.UPointName }}(p, {{$c}}, points, pscalars, splitFirstChunk, processChunk, processLastChunk) + _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} {{- end}} default: @@ -445,8 +445,8 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, pscalars []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, pscalars []uint32)) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint32, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint32)) *{{ $.TJacobian }} { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -466,10 +466,10 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, pscalars[int(nbChunks-1)*n:]) + go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j >0; j-- { - go processChunk(uint64(j), chChunks[j], c, points, pscalars[j*n:(j+1)*n]) + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] @@ -477,12 +477,12 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. 
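// [editor's note] A worked example for the splitFirstChunk heuristic above:
// "small" scalars contribute only to the first (least-significant) window,
// which is common for SNARK witnesses full of 0/1 values, so the first chunk
// carries disproportionate work. With len(scalars) = 1000 and
// smallValues = 150, the ratio 150/1000 = 0.15 >= 0.1 triggers the split: the
// code below processes the first chunk as two halves and sums the two partial
// results read from chSplit.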
 	if !splitFirstChunk {
-		go processChunk(0, chChunks[0], c, points, pscalars[:n])
+		go processChunk(0, chChunks[0], c, points, digits[:n])
 	} else {
 		chSplit := make(chan {{ $.TJacobianExtended }}, 2)
 		split := n / 2
-		go processChunk(0, chSplit, c, points[:split], pscalars[:split])
-		go processChunk(0, chSplit, c, points[split:], pscalars[split:n])
+		go processChunk(0, chSplit, c, points[:split], digits[:split])
+		go processChunk(0, chSplit, c, points[split:], digits[split:n])
 		go func() {
 			s1 := <-chSplit
 			s2 := <-chSplit
diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl
index 518c7e3406..8fe8391308 100644
--- a/internal/generator/ecc/template/multiexp_affine.go.tmpl
+++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl
@@ -7,6 +7,8 @@
 {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }}
 
+
+
 const MAX_BATCH_SIZE = 600
 
 type batchOp struct {
@@ -37,45 +39,136 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64
 	chRes chan<- {{ $.TJacobianExtended }},
 	c uint64,
 	points []{{ $.TAffine }},
-	pscalars []uint32) {
+	digits []uint32) {
 
+	// init the buckets
 	var buckets B
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 	}
-	batch := newBatch{{ $.TAffine }}(&buckets, points)
+	// set up the batch affine addition
+	batchSize := len(buckets) / 5
+	if batchSize > MAX_BATCH_SIZE {
+		batchSize = MAX_BATCH_SIZE
+	}
+	if batchSize <= 0 {
+		batchSize = 1
+	}
+	bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here
+	cptP := 0 // counts the number of points added to the current batch
+
+	var P [MAX_BATCH_SIZE]{{ $.TAffine }}  // allocated on the stack
+	var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // ...
+
+	canAdd := func(bID uint32) bool {
+		_, ok := bucketIds[bID]
+		return !ok
+	}
+
+	isFull := func() bool {
+		return cptP == batchSize
+	}
+
+	executeAndReset := func() {
+		if cptP == 0 {
+			return
+		}
+		BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP], cptP)
+		for k := range bucketIds {
+			delete(bucketIds, k)
+		}
+		cptP = 0
+	}
+
+	add := func(op batchOp) {
+		// canAdd must be called first --> ensures the bucket is not already "used" in the current batch
+
+		BK := &buckets[op.bucketID]
+		PP := &points[op.pointID>>1]
+		if PP.IsInfinity() {
+			return
+		}
+		// handle special cases with inf or -P / P
+		if BK.IsInfinity() {
+			if op.isNeg() {
+				BK.Neg(PP)
+			} else {
+				BK.Set(PP)
+			}
+			return
+		}
+		if op.isNeg() {
+			// if bucket == P --> -P == 0
+			if BK.Equal(PP) {
+				BK.setInfinity()
+				return
+			}
+		} else {
+			// if bucket == -P, B == 0
+			if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) {
+				BK.setInfinity()
+				return
+			}
+		}
+
+		// bucketIds[cptP] = op.bucketID
+		bucketIds[op.bucketID] = struct{}{}
+		R[cptP] = BK
+		if op.isNeg() {
+			P[cptP].Neg(PP)
+		} else {
+			P[cptP].Set(PP)
+		}
+		cptP++
+	}
+
+	queue := make([]batchOp, 0, 4096) // TODO find right capacity here.
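// [editor's note] The processQueue closure below retries parked ops with an
// in-place swap-remove: a consumed slot is overwritten by the last element,
// the slice shrinks by one, and i-- re-examines the swapped-in op. A short
// trace, assuming every op has become addable:
//
//	queue = [a b c]  i=0: run a, swap in c -> queue = [c b], revisit slot 0
//	queue = [c b]    i=0: run c, swap in b -> queue = [b],   revisit slot 0
//	queue = [b]      i=0: run b            -> queue = []
//
// Order is not preserved, which is fine here: ops parked for distinct buckets
// are independent of each other.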
+
+
+	processQueue := func () {
+		// for i := len(queue) - 1; i >= 0; i-- {
+		for i := 0; i < len(queue); i++ {
+			if canAdd(queue[i].bucketID) {
+				add(queue[i])
+				if isFull() {
+					executeAndReset()
+				}
+				queue[i] = queue[len(queue)-1]
+				queue = queue[:len(queue)-1]
+				i--
+			}
+		}
+	}
+
 	nbBatches := 0
-	for i := 0; i < len(pscalars); i++ {
-		bits := pscalars[i]
+	for i, digit := range digits {
-		if bits == 0 {
+		if digit == 0 {
 			continue
 		}
 		op := batchOp{pointID: uint32(i) << 1}
 		// if msbWindow bit is set, we need to subtract
-		if bits&1 == 0 {
+		if digit&1 == 0 {
 			// add
-			op.bucketID = uint32((bits>>1) - 1)
-			// buckets[bits-1].Add(&points[i], &buckets[bits-1])
+			op.bucketID = uint32((digit>>1) - 1)
 		} else {
 			// sub
-			op.bucketID = (uint32((bits>>1)))
+			op.bucketID = (uint32((digit>>1)))
 			op.pointID += 1
-			// op.isNeg = true
-			// buckets[bits & ^msbWindow].Sub( &buckets[bits & ^msbWindow], &points[i])
 		}
-		if batch.CanAdd(op.bucketID) {
-			batch.Add(op)
-			if batch.IsFull() {
-				batch.ExecuteAndReset()
+		if canAdd(op.bucketID) {
+			add(op)
+			if isFull() {
+				executeAndReset()
 				nbBatches++
 				if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing
-					batch.Add(queue[len(queue)-1])
+					add(queue[len(queue)-1])
 					queue = queue[:len(queue)-1]
 				}
+				// processQueue()
 			}
 		} else {
 			// put it in queue.
@@ -84,14 +177,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64
 	}
 	// fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n",
 	// 	chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points))
-	// batch.ExecuteAndReset()
+	// executeAndReset()
 	for len(queue) != 0 {
-		queue = processQueue{{ $.TAffine }}(queue, &batch)
-		batch.ExecuteAndReset() // execute batch even if not full.
+		processQueue()
+		executeAndReset() // execute batch even if not full.
 	}
 	// flush items in batch.
-	batch.ExecuteAndReset()
+	executeAndReset()
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ...
+ n*bucket[n-1] @@ -122,112 +215,4 @@ type ib{{ $.TAffine }} interface { {{- end}} } - -type Batch{{ $.TAffine }}[B ib{{ $.TAffine }}] struct { - P [MAX_BATCH_SIZE]{{ $.TAffine }} - R [MAX_BATCH_SIZE]*{{ $.TAffine }} - batchSize int - cptP int - bucketIds map[uint32]struct{} - points []{{ $.TAffine }} - buckets *B -} - -func newBatch{{ $.TAffine }}[B ib{{ $.TAffine }}](buckets *B, points []{{ $.TAffine }}) Batch{{ $.TAffine }}[B] { - batchSize := len(*buckets) / 5 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } - return Batch{{ $.TAffine }}[B]{ - buckets: buckets, - points: points, - batchSize: batchSize, - bucketIds: make(map[uint32]struct{}, len(*buckets)/2), - } -} - -func (b *Batch{{ $.TAffine }}[B]) IsFull() bool { - return b.cptP == b.batchSize -} - -func (b *Batch{{ $.TAffine }}[B]) ExecuteAndReset() { - if b.cptP == 0 { - return - } - // for i := 0; i < len(b.R); i++ { - // b.R[i].Add(b.R[i], b.P[i]) - // } - BatchAdd{{ $.TAffine }}(b.R[:b.cptP], b.P[:b.cptP], b.cptP) - for k := range b.bucketIds { - delete(b.bucketIds, k) - } - // b.bucketIds = [MAX_BATCH_SIZE]uint32{} - b.cptP = 0 -} - -func (b *Batch{{ $.TAffine }}[B]) CanAdd(bID uint32) bool { - _, ok := b.bucketIds[bID] - return !ok -} - -func (b *Batch{{ $.TAffine }}[B]) Add(op batchOp) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &(*b.buckets)[op.bucketID] - P := &b.points[op.pointID>>1] - if P.IsInfinity() { - return - } - // handle special cases with inf or -P / P - if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(P) - } else { - BK.Set(P) - } - return - } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(P) { - BK.setInfinity() - return - } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&P.X) && !BK.Y.Equal(&P.Y) { - BK.setInfinity() - return - } - } - - // b.bucketIds[b.cptP] = op.bucketID - b.bucketIds[op.bucketID] = struct{}{} - b.R[b.cptP] = BK - if op.isNeg() { - b.P[b.cptP].Neg(P) - } else { - b.P[b.cptP].Set(P) - } - b.cptP++ -} - -func processQueue{{ $.TAffine }}[B ib{{ $.TAffine }}](queue []batchOp, batch *Batch{{ $.TAffine }}[B]) []batchOp { - for i := len(queue) - 1; i >= 0; i-- { - if batch.CanAdd(queue[i].bucketID) { - batch.Add(queue[i]) - if batch.IsFull() { - batch.ExecuteAndReset() - } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - } - } - return queue - -} - {{end }} diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index 8fb94f9f5b..ee1f1d2080 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -19,7 +19,7 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - pscalars []uint32) { + digits []uint32) { @@ -29,20 +29,18 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk } // for each scalars, get the digit corresponding to the chunk we're processing. 
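// A note on the digit convention shared by the batch-affine loop above and
// the jacobian loop below: each digit is a signed base-2^c limb, pre-encoded
// in a uint32. Zero means "skip this point"; otherwise the low bit is a sign
// flag and the remaining bits select the bucket. A sketch of the decoding
// (hypothetical helper; callers must check digit != 0 first):
//
//	func decodeDigit(digit uint32) (bucketID uint32, isNeg bool) {
//		if digit&1 == 1 {
//			return digit >> 1, true // subtract the point from bucket digit>>1
//		}
//		return (digit >> 1) - 1, false // add the point to bucket (digit>>1)-1
//	}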
- for i := 0; i < len(pscalars); i++ { - bits := pscalars[i] - - if bits == 0 { + for i, digit := range digits { + if digit == 0 { continue } // if msbWindow bit is set, we need to substract - if bits & 1 == 0 { + if digit & 1 == 0 { // add - buckets[(bits>>1)-1].addMixed(&points[i]) + buckets[(digit>>1)-1].addMixed(&points[i]) } else { // sub - buckets[(bits>>1)].subMixed(&points[i]) + buckets[(digit>>1)].subMixed(&points[i]) } } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 5fa8d37944..49f36bb6b3 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -287,7 +287,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var testPoint {{ $.TAffine }} - for i := 11; i <= pow; i++ { + for i := 14; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { From 653877c58e38196aa069e434f90ad5a4a90ba9e0 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 07:41:01 -0600 Subject: [PATCH 08/43] feat: toying with batch size --- ecc/bls12-377/multiexp_affine.go | 38 +++++++++---------- ecc/bls12-377/multiexp_test.go | 8 +--- ecc/bls12-378/multiexp_affine.go | 38 +++++++++---------- ecc/bls12-378/multiexp_test.go | 8 +--- ecc/bls12-381/multiexp_affine.go | 38 +++++++++---------- ecc/bls12-381/multiexp_test.go | 8 +--- ecc/bls24-315/multiexp_affine.go | 38 +++++++++---------- ecc/bls24-315/multiexp_test.go | 8 +--- ecc/bls24-317/multiexp_affine.go | 38 +++++++++---------- ecc/bls24-317/multiexp_test.go | 8 +--- ecc/bn254/multiexp_affine.go | 38 +++++++++---------- ecc/bn254/multiexp_test.go | 8 +--- ecc/bw6-633/multiexp_affine.go | 38 +++++++++---------- ecc/bw6-633/multiexp_test.go | 8 +--- ecc/bw6-756/multiexp_affine.go | 38 +++++++++---------- ecc/bw6-756/multiexp_test.go | 8 +--- ecc/bw6-761/multiexp_affine.go | 38 +++++++++---------- ecc/bw6-761/multiexp_test.go | 8 +--- .../ecc/template/multiexp_affine.go.tmpl | 22 +++++------ .../ecc/template/tests/multiexp.go.tmpl | 8 +--- 20 files changed, 201 insertions(+), 243 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c2e56a6936..015864143d 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12377 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? 
should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index afc5108951..72f274c242 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 64ca8320b2..fa5dc6c792 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12378 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. 
+ queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index a77f7097e1..85f7b72e69 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index c965e24de2..f8ae40fb18 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12381 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. 
queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 457546524f..dbcdc6eb85 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 10c47b3306..485f6960ff 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -16,7 +16,7 @@ package bls24315 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. 
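// The batchSize heuristic is the knob this commit is toying with. Worked
// numbers, assuming window c = 16: there are 2^(16-1) = 32768 buckets, so
// len(buckets)/30 gives 1092, under the raised MAX_BATCH_SIZE of 2000; for
// c = 10 it drops to 512/30 = 17. A smaller batch-to-bucket ratio makes
// same-bucket collisions within a batch rarer, while a larger batch amortizes
// the single field inversion (the 1I in "cost add: 5*batchSize M + 1I") over
// more affine additions:
//
//	nbBuckets := 1 << (c - 1) // 32768 for c = 16
//	batchSize := nbBuckets / 30
//	// per-op cost ≈ 5M + I/batchSize, so growing batchSize helps until
//	// the inversion's share becomes negligible.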
processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 1b697194a5..b2978d6aa0 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 2833b83137..432592fa23 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -16,7 +16,7 @@ package bls24317 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. 
queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index eb190a317b..799b903db7 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 5fea03fa0e..42d6413264 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -16,7 +16,7 @@ package bn254 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. 
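// processQueue (below) drains the overflow queue with an order-destroying
// swap-remove: a schedulable op is overwritten by the last element, the slice
// is shortened, and the same index is revisited (hence the i-- after the
// swap); ops whose bucket is still busy in the current batch stay put for a
// later pass. The idiom, on a generic slice s:
//
//	// delete element i in O(1), without preserving order
//	s[i] = s[len(s)-1]
//	s = s[:len(s)-1]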
processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -253,7 +253,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -328,7 +328,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -367,19 +367,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 05d133cdbe..7379ddccbd 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 8305011734..49484c36c9 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6633 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -231,7 +231,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -306,7 +306,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -345,19 +345,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. 
queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 75e1904bf0..b367a2fe2f 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 8b01509015..e1daea4ffe 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6756 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -231,7 +231,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -306,7 +306,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. 
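// Why an overflow queue is enough: assuming bucket indices were uniform
// (only roughly true for real scalar distributions), the chance that a fresh
// op collides with the current batch is about batchSize/len(buckets), i.e.
// roughly 1/30 ≈ 3% with the sizing above, so the queue should stay short.
// A rough sanity check:
//
//	pCollision := float64(batchSize) / float64(len(buckets)) // ≈ 0.033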
processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -345,19 +345,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index c37051d70d..f82d71c32f 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 6cdd72b7a4..9edda2244b 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6761 -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -45,7 +45,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -120,7 +120,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -159,19 +159,19 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. 
queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() @@ -231,7 +231,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -306,7 +306,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. processQueue := func() { // for i := len(queue) - 1; i >= 0; i-- { @@ -345,19 +345,19 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index b0a94d79dd..884a8564f8 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -698,11 +698,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 8fe8391308..a075f3b083 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -7,9 +7,7 @@ {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} - - -const MAX_BATCH_SIZE = 600 +const MAX_BATCH_SIZE = 2000 type batchOp struct { bucketID, pointID uint32 @@ -48,7 +46,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } // setup for the batch affine; - batchSize := len(buckets) / 5 + batchSize := len(buckets) / 30 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } @@ -124,7 +122,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } - queue := make([]batchOp, 0, 4096) // TODO find right capacity here. + queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. processQueue := func () { @@ -164,19 +162,19 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 if isFull() { executeAndReset() nbBatches++ - if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - add(queue[len(queue)-1]) - queue = queue[:len(queue)-1] - } - // processQueue() + // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing + // add(queue[len(queue)-1]) + // queue = queue[:len(queue)-1] + // } + processQueue() } } else { // put it in queue. queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n", - // chunk, len(queue), nbBatches, batch.batchSize, len(buckets), len(points)) + // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", + // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() for len(queue) != 0 { processQueue() diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 49f36bb6b3..c673254b2f 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -384,11 +384,7 @@ func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } From e43bb7674c04056ce08f91ef593b345d01ae86e4 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 11:55:36 -0600 Subject: [PATCH 09/43] perf: msm affine OK on x86 --- ecc/bls12-377/g1.go | 38 +++--- ecc/bls12-377/g2.go | 38 +++--- ecc/bls12-377/multiexp.go | 20 +-- ecc/bls12-377/multiexp_affine.go | 116 ++++++++++-------- ecc/bls12-377/multiexp_jacobian.go | 12 +- ecc/bls12-377/multiexp_test.go | 28 ++--- ecc/bls12-378/g1.go | 38 +++--- ecc/bls12-378/g2.go | 38 +++--- ecc/bls12-378/multiexp.go | 20 +-- ecc/bls12-378/multiexp_affine.go | 116 ++++++++++-------- ecc/bls12-378/multiexp_jacobian.go | 12 +- ecc/bls12-378/multiexp_test.go | 28 ++--- ecc/bls12-381/g1.go | 38 +++--- ecc/bls12-381/g2.go | 38 +++--- ecc/bls12-381/multiexp.go | 20 +-- ecc/bls12-381/multiexp_affine.go | 116 ++++++++++-------- ecc/bls12-381/multiexp_jacobian.go | 12 +- ecc/bls12-381/multiexp_test.go | 28 ++--- ecc/bls24-315/g1.go | 38 +++--- ecc/bls24-315/g2.go | 38 +++--- ecc/bls24-315/multiexp.go | 20 +-- ecc/bls24-315/multiexp_affine.go | 116 ++++++++++-------- ecc/bls24-315/multiexp_jacobian.go | 12 +- ecc/bls24-315/multiexp_test.go | 28 ++--- ecc/bls24-317/g1.go | 38 +++--- ecc/bls24-317/g2.go | 38 +++--- ecc/bls24-317/multiexp.go | 20 +-- ecc/bls24-317/multiexp_affine.go | 116 ++++++++++-------- ecc/bls24-317/multiexp_jacobian.go | 12 +- ecc/bls24-317/multiexp_test.go | 28 ++--- ecc/bn254/g1.go | 38 +++--- ecc/bn254/g2.go | 38 +++--- ecc/bn254/multiexp.go | 20 +-- ecc/bn254/multiexp_affine.go | 116 ++++++++++-------- ecc/bn254/multiexp_jacobian.go | 12 +- ecc/bn254/multiexp_test.go | 28 ++--- ecc/bw6-633/g1.go | 38 +++--- ecc/bw6-633/g2.go | 38 +++--- ecc/bw6-633/multiexp_affine.go | 104 ++++++++++------ ecc/bw6-633/multiexp_test.go | 26 +--- ecc/bw6-756/g1.go | 38 +++--- ecc/bw6-756/g2.go | 38 +++--- ecc/bw6-756/multiexp_affine.go | 104 ++++++++++------ ecc/bw6-756/multiexp_test.go | 26 +--- ecc/bw6-761/g1.go | 38 +++--- ecc/bw6-761/g2.go | 38 +++--- ecc/bw6-761/multiexp_affine.go | 104 ++++++++++------ ecc/bw6-761/multiexp_test.go | 26 +--- internal/generator/config/curve.go | 2 +- .../ecc/template/multiexp_affine.go.tmpl | 52 +++++--- internal/generator/ecc/template/point.go.tmpl | 36 +++--- .../ecc/template/tests/multiexp.go.tmpl | 16 +-- 52 files changed, 1042 insertions(+), 1194 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 3b436a6b2b..962a527a2e 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -983,31 +983,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + 
lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1036,19 +1032,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 18810fe510..dd09808a13 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -979,31 +979,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E2 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E2 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E2 var rr G2Affine @@ -1032,19 +1028,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E2, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { +func batchInvertG2Affine(res, a []fptower.E2) { var accumulator fptower.E2 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 89db336b35..98de6ca242 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, 
splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 015864143d..66c9d4e8bf 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12377 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
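// This commit swaps the heap-allocated queue slice for the fixed-size array
// below: MAX_BATCH_SIZE drops back to 600, presumably so that the batchOp
// array (like the P and R arrays above it) stays cheap to keep in the
// goroutine's frame, and qID acts as a stack pointer. Push and pop reduce to
// index arithmetic with no allocator traffic:
//
//	queue[qID] = op // push; the code flushes before qID can overflow
//	qID++
//	op := queue[qID-1] // peek the top, then pop with qID--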
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
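+	// Two draining strategies coexist. processQueue walks the whole queue
+	// backwards, retries every op whose bucket has been freed, and
+	// compacts by swapping the last element into the hole. processTopQueue
+	// only pops from the top while the most recently queued op is addable
+	// and stops at the first conflict: a cheaper pass, used right after a
+	// batch flush when most buckets have just been released.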
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index 2c95e7f536..be722067bd 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 72f274c242..ca2e50f59c 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 5b9ec0f84f..8422e95efb 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -983,31 +983,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + 
isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1036,19 +1032,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 0010b3983b..9cca73e6b3 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -979,31 +979,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E2 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E2 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E2 var rr G2Affine @@ -1032,19 +1028,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E2, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { +func batchInvertG2Affine(res, a []fptower.E2) { var accumulator fptower.E2 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 5914a5a0d6..917f493796 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits 
[]uint32, splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index fa5dc6c792..a48d9d1cfd 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12378 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
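+	// Sizing example: with a window of c = 16 there are 1<<15 = 32768
+	// buckets, so len(buckets)/20 = 1638 gets clamped to MAX_BATCH_SIZE
+	// (600); with c = 5 and only 16 buckets, 16/20 = 0 is bumped back up
+	// to a batch of 1.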
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
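+	// Cost sketch for the BatchAddG2Affine call in executeAndReset, per
+	// its header comment: roughly 5 extension-field mults per point (about
+	// 3 amortized from the shared batch inversion, plus the slope and
+	// coordinate products) and one inversion for the whole batch; a
+	// doubling spends one extra mult on the x^2 term of the tangent slope.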
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 3ce29436eb..6a8cfa2d32 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 85f7b72e69..339323bbac 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index eccf0c9c97..bb37dacb65 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -983,31 +983,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + 
isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1036,19 +1032,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 5264766d99..86ce9db5b6 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -980,31 +980,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E2 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E2 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E2 var rr G2Affine @@ -1033,19 +1029,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E2, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { +func batchInvertG2Affine(res, a []fptower.E2) { var accumulator fptower.E2 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 7f730ca946..8283ce4957 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits 
[]uint32, splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index f8ae40fb18..d30c10293e 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -16,7 +16,7 @@ package bls12381 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
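+	// The batchInvertG1Affine behind BatchAddG1Affine is the Montgomery
+	// trick. For inputs a0, a1, a2 it stores the running prefix products
+	// 1, a0, a0*a1, inverts t = (a0*a1*a2)^-1 once, then sweeps backwards:
+	//   a2^-1 = (a0*a1) * t,  then t *= a2
+	//   a1^-1 = a0 * t,       then t *= a1
+	//   a0^-1 = t
+	// One inversion amortized across the batch, ~3 extra mults per element.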
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
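+	// Caveat carried by the "ignores edge cases" comment on
+	// batchInvertG2Affine: a single zero denominator (adding a point to
+	// its own negative, or doubling a point with Y = 0) would zero the
+	// running product and corrupt every inverse in the batch, so the
+	// scheduler must only ever submit ops with nonzero denominators.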
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 7c69354658..fabbf2d237 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index dbcdc6eb85..ce2153872e 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index 173d24e902..e55d4ad4cb 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -985,31 +985,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + 
isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1038,19 +1034,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index e498978c0b..f5dffd0752 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -995,31 +995,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E4 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E4 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E4 var rr G2Affine @@ -1048,19 +1044,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E4, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E4, n int) { +func batchInvertG2Affine(res, a []fptower.E4) { var accumulator fptower.E4 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 37b43c6fe8..922c80cd89 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits 
[]uint32, splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 485f6960ff..c7aa56e2d5 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -16,7 +16,7 @@ package bls24315 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
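+	// Why at most one op per bucket per batch (the bucketIds set that
+	// canAdd consults): BatchAddG1Affine reads every R[j] up front when it
+	// forms the batched slope denominators, so a second addition into the
+	// same bucket within one batch would see a stale bucket value.
+	// Conflicting ops wait in the queue until executeAndReset clears
+	// bucketIds.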
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
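+	// If the queue itself fills (qID reaching MAX_BATCH_SIZE-1 in the loop
+	// below), the in-flight batch is flushed with executeAndReset, which
+	// releases every contended bucket, and processQueue then drains as
+	// much of the backlog as possible before normal scheduling resumes.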
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 6663cc9e73..a3d633de01 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index b2978d6aa0..f8513bd3a1 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 9443125d34..58bee14819 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -985,31 +985,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + 
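The restructured BatchAdd* bodies implement the standard affine chord-and-tangent formulas for the a = 0 short Weierstrass curves used here; only the slope denominator differs between the two branches, and that is exactly what gets batch-inverted:

    add (P != R):  lambda = (y_P - y_R) / (x_P - x_R)
    dbl (P == R):  lambda = 3*x_P^2 / (2*y_P)

    x' = lambda^2 - x_R - x_P
    y' = lambda*(x_R - x') - y_R

So the first loop stores only the denominator (x_P - x_R, or 2*y_P when isDbl) in lambdain, one shared inversion produces all the reciprocals at once, and the second loop multiplies each reciprocal by its numerator before applying x'/y'. That is where the stated cost of 5 M per addition plus one amortized I comes from (a doubling pays one extra M for the x^2).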
isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1038,19 +1034,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 0e2738e211..f5fb993fb4 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -995,31 +995,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E4 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E4 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E4 var rr G2Affine @@ -1048,19 +1044,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E4, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E4, n int) { +func batchInvertG2Affine(res, a []fptower.E4) { var accumulator fptower.E4 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 5cb36f1788..923946e34f 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits 
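batchInvertG1Affine / batchInvertG2Affine are Montgomery's batch-inversion trick, now slice-based so the caller just passes lambda[:batchSize]: a forward pass accumulates prefix products, a single field inversion is paid for the whole batch, and a reverse pass peels off the individual inverses. A self-contained sketch over a toy prime field with math/big, not part of the patch (the real code works on fp.Element / tower elements and, per its comment, ignores the zero-input edge case):

package main

import (
	"fmt"
	"math/big"
)

// batchInvert sets res[i] = a[i]^-1 mod p using one modular inversion
// (Montgomery's trick). Assumes no a[i] is zero.
func batchInvert(res, a []*big.Int, p *big.Int) {
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1]
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the single inversion
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc) // prefix * (a[0..i])^-1 = a[i]^-1
		res[i].Mod(res[i], p)
		acc.Mul(acc, a[i]) // drop a[i] from the running inverse
		acc.Mod(acc, p)
	}
}

func main() {
	p := big.NewInt(101) // toy prime
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(10)}
	res := make([]*big.Int, len(a))
	batchInvert(res, a, p)
	for i := range a {
		check := new(big.Int).Mul(a[i], res[i])
		fmt.Println(a[i], "* inv =", check.Mod(check, p)) // always 1
	}
}
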
[]uint32, splitFirstC case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 432592fa23..ccd70a9474 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -16,7 +16,7 @@ package bls24317 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
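The batch size heuristic also changes: one twentieth of the bucket count instead of one thirtieth, clamped to [1, MAX_BATCH_SIZE]. A larger batch amortizes the single inversion over more additions at the price of more bucket-conflict queuing. What the clamp yields per window size, as a quick standalone check (not part of the patch):

package main

import "fmt"

const maxBatchSize = 600

func batchSizeFor(c int) int {
	nbBuckets := 1 << (c - 1)
	b := nbBuckets / 20
	if b > maxBatchSize {
		b = maxBatchSize
	}
	if b <= 0 {
		b = 1
	}
	return b
}

func main() {
	for _, c := range []int{4, 10, 16} {
		fmt.Printf("c=%2d buckets=%5d batchSize=%d\n", c, 1<<(c-1), batchSizeFor(c))
	}
	// c= 4 buckets=    8 batchSize=1
	// c=10 buckets=  512 batchSize=25
	// c=16 buckets=32768 batchSize=600  (clamped)
}
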
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
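processTopQueue, introduced next to processQueue, is the cheap variant used right after a flush: executeAndReset has just cleared every pending bucket ID, so entries at the top of the queue are likely schedulable, and it pops from the top and stops at the first conflict instead of scanning the whole array. A standalone sketch of that drain, not part of the patch:

package main

import "fmt"

type batchOp struct{ bucketID, pointID uint32 }

func main() {
	queue := []batchOp{{0, 0}, {5, 1}, {2, 2}, {4, 3}} // bottom ... top
	qID := len(queue)
	canAdd := func(id uint32) bool { return id%2 == 0 } // stand-in

	// pop schedulable ops off the top; bail at the first conflict
	for i := qID - 1; i >= 0; i-- {
		if !canAdd(queue[i].bucketID) {
			break
		}
		fmt.Println("scheduled", queue[i])
		qID--
	}
	fmt.Println("left in queue:", qID) // stops at bucketID 5
}
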
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index fccf3e949d..7e832db4e7 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | 
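Dropping c = 20 and c = 21 is consistent with the batch-affine buckets being a plain value array per chunk: 2^(c-1) affine points at c = 21 is 2^20 points, versus 2^15 at the new maximum of c = 16. Back-of-the-envelope footprint, assuming bn254 G1Affine at 64 bytes (two 4x uint64 field elements; other curves are larger) — an assumption for illustration, the patch itself states no rationale:

package main

import "fmt"

func main() {
	const pointSize = 64 // assumed: bn254 G1Affine, 2 x 32-byte fp.Element
	for _, c := range []int{16, 20, 21} {
		nbBuckets := 1 << (c - 1)
		fmt.Printf("c=%2d: %7d buckets = %2d MiB of G1 buckets per chunk\n",
			c, nbBuckets, nbBuckets*pointSize>>20)
	}
	// c=16:   32768 buckets =  2 MiB
	// c=20:  524288 buckets = 32 MiB
	// c=21: 1048576 buckets = 64 MiB
}
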
bucketg2JacExtendedC14 | bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 799b903db7..214d884e11 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 0844716e0f..80cec53604 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -955,31 +955,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true 
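With the explicit batchSize parameter gone, the batch length is simply len(R), and call sites slice: BatchAddG1Affine(R[:cptP], P[:cptP]). A hedged usage sketch against the bn254 package as it stands in this patch (BatchAddG1Affine exported, P == R lanes taking the doubling path); not part of the patch:

package main

import (
	"fmt"

	"github.com/consensys/gnark-crypto/ecc/bn254"
)

func main() {
	_, _, g, _ := bn254.Generators() // affine G1 generator

	const n = 4 // must stay <= MAX_BATCH_SIZE
	var acc [n]bn254.G1Affine // accumulators, updated in place via R
	var pts [n]bn254.G1Affine // points to add into them
	R := make([]*bn254.G1Affine, n)
	for i := 0; i < n; i++ {
		acc[i] = g
		pts[i] = g
		R[i] = &acc[i]
	}

	// one shared inversion for all n lanes; every pair here is
	// P == R, so each lane doubles.
	bn254.BatchAddG1Affine(R, pts[:])

	var dbl bn254.G1Jac
	dbl.FromAffine(&g).DoubleAssign()
	var want bn254.G1Affine
	want.FromJacobian(&dbl)
	fmt.Println("batch double matches:", acc[0].Equal(&want))
}
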
+ lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1008,19 +1004,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 23203fd92c..79215583d5 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -984,31 +984,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fptower.E2 - - { - var lambdain [MAX_BATCH_SIZE]fptower.E2 - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fptower.E2 var rr G2Affine @@ -1037,19 +1033,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfptower.E2, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fptower.E2, n int) { +func batchInvertG2Affine(res, a []fptower.E2) { var accumulator fptower.E2 accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index e3c84b390f..410e4016ab 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -198,14 +198,6 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC case 16: processChunk := 
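bestC scans the implemented window sizes and keeps the c minimizing the approximate group-operation count bits/c * (nbPoints + 2^c), where bits is the scalar field size; with 20 and 21 removed the scan now tops out at 16. A standalone sketch of that selection, not part of the patch (254-bit scalars as for bn254; the +1 is a rough ceiling on the chunk count):

package main

import "fmt"

func bestC(nbPoints int) uint64 {
	implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
	const bits = 254 // fr bit length for bn254
	var best uint64
	min := uint64(1 << 62)
	for _, c := range implementedCs {
		// chunks x (one add per point + bucket reduction)
		cost := (bits/c + 1) * (uint64(nbPoints) + 1<<c)
		if cost < min {
			min = cost
			best = c
		}
	}
	return best
}

func main() {
	for _, n := range []int{1 << 10, 1 << 16, 1 << 20} {
		fmt.Printf("nbPoints=%8d -> c=%d\n", n, bestC(n))
	}
	// larger inputs push toward wider windows, capped at 16
}
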
processChunkG1BatchAffine[bucketG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG1BatchAffine[bucketG1AffineC20] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC16] - _innerMsmG1(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG1BatchAffine[bucketG1AffineC21] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -338,7 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + implementedCs := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -452,14 +444,6 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) - case 20: - processChunk := processChunkG2BatchAffine[bucketG2AffineC20] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC16] - _innerMsmG2(p, 20, points, digits, splitFirstChunk, processChunk, processLastChunk) - case 21: - processChunk := processChunkG2BatchAffine[bucketG2AffineC21] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 21, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 42d6413264..62c70e876a 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -16,7 +16,7 @@ package bn254 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
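MAX_BATCH_SIZE drops from 2000 to 600. It bounds every per-chunk scratch array declared by value (P, R, lambda, lambdain, isDbl, and now the queue itself), so the constant directly sets the stack frame of the chunk processor. A rough count for bn254 G1, with assumed sizes for illustration (the patch gives no figures):

package main

import "fmt"

func main() {
	const maxBatchSize = 600
	const (
		affineSize = 64 // assumed: G1Affine, 2 x 32-byte fp.Element
		fpSize     = 32 // assumed: fp.Element, 4 x uint64
		ptrSize    = 8
		opSize     = 8 // batchOp: two uint32
	)
	perEntry := affineSize + // P
		ptrSize + // R
		2*fpSize + // lambda + lambdain
		1 + // isDbl
		opSize // queue
	fmt.Printf("per-chunk scratch: about %d KiB (vs %d KiB at 2000)\n",
		maxBatchSize*perEntry/1024, 2000*perEntry/1024)
}
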
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -213,8 +226,6 @@ type bucketG1AffineC13 [1 << (13 - 1)]G1Affine type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine -type bucketG1AffineC20 [1 << (20 - 1)]G1Affine -type bucketG1AffineC21 [1 << (21 - 1)]G1Affine type ibG1Affine interface { bucketG1AffineC4 | @@ -229,9 +240,7 @@ type ibG1Affine interface { bucketG1AffineC13 | bucketG1AffineC14 | bucketG1AffineC15 | - bucketG1AffineC16 | - bucketG1AffineC20 | - bucketG1AffineC21 + bucketG1AffineC16 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -253,15 +262,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -279,7 +288,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -328,24 +337,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
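Why ops are queued at all: a batch must never contain two additions into the same bucket, because BatchAdd* reads every R[j] before writing any of them, so a second addition against the same bucket would use a stale value. bucketIds is the conflict set behind canAdd, cleared in place after each flush (the delete-in-range idiom reuses the map's storage across batches). A compact sketch of that gatekeeping, not part of the patch, with executeAndReset standing in for the real batch add:

package main

import "fmt"

func main() {
	const batchSize = 3
	bucketIds := make(map[uint32]struct{}, batchSize)
	pending := 0

	canAdd := func(bID uint32) bool {
		_, ok := bucketIds[bID]
		return !ok
	}
	add := func(bID uint32) {
		bucketIds[bID] = struct{}{}
		pending++
	}
	executeAndReset := func() {
		fmt.Println("flush batch of", pending)
		for k := range bucketIds { // clear without reallocating
			delete(bucketIds, k)
		}
		pending = 0
	}

	for _, bID := range []uint32{7, 9, 7, 3, 9} {
		if !canAdd(bID) {
			fmt.Println("conflict on bucket", bID, "-> would be queued")
			continue
		}
		add(bID)
		if pending == batchSize {
			executeAndReset()
		}
	}
	if pending > 0 {
		executeAndReset() // execute batch even if not full
	}
}
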
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -366,22 +387,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -421,8 +443,6 @@ type bucketG2AffineC13 [1 << (13 - 1)]G2Affine type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine -type bucketG2AffineC20 [1 << (20 - 1)]G2Affine -type bucketG2AffineC21 [1 << (21 - 1)]G2Affine type ibG2Affine interface { bucketG2AffineC4 | @@ -437,7 +457,5 @@ type ibG2Affine interface { bucketG2AffineC13 | bucketG2AffineC14 | bucketG2AffineC15 | - bucketG2AffineC16 | - bucketG2AffineC20 | - bucketG2AffineC21 + bucketG2AffineC16 } diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index ef34f5faad..a682232ec6 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -74,8 +74,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC20 [1 << (20 - 1)]g1JacExtended -type bucketg1JacExtendedC21 [1 << (21 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended @@ -94,9 +92,7 @@ type ibg1JacExtended interface { bucketg1JacExtendedC13 | bucketg1JacExtendedC14 | bucketg1JacExtendedC15 | - bucketg1JacExtendedC16 | - bucketg1JacExtendedC20 | - bucketg1JacExtendedC21 + bucketg1JacExtendedC16 } func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, @@ -157,8 +153,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC20 [1 << (20 - 1)]g2JacExtended -type bucketg2JacExtendedC21 [1 << (21 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended @@ -177,7 +171,5 @@ type ibg2JacExtended interface { bucketg2JacExtendedC13 | bucketg2JacExtendedC14 | 
bucketg2JacExtendedC15 | - bucketg2JacExtendedC16 | - bucketg2JacExtendedC20 | - bucketg2JacExtendedC21 + bucketg2JacExtendedC16 } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 7379ddccbd..bf12818dd5 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -102,7 +102,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index 860ce3e355..41a18cf2af 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1087,31 +1087,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) 
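The benchmark edits are the same across all curves: the separate "ext-jacobian" and "affine" sub-benchmarks collapse into a single "N points" run per size (the implementation now selects the kernel itself), the size sweep starts at 2^5 instead of 2^14, and the reference benchmark grows from 2^20 to 2^23 points. The b.Run sub-benchmark pattern they all use, on a stand-in function (not part of the patch; run with `go test -bench=.`):

package bench

import (
	"fmt"
	"testing"
)

// stand-in for MultiExp: sums the first n ints
func sum(xs []int) int {
	s := 0
	for _, x := range xs {
		s += x
	}
	return s
}

func BenchmarkSum(b *testing.B) {
	const pow = 12
	samples := make([]int, 1<<pow)

	for i := 5; i <= pow; i++ {
		using := 1 << i
		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {
			b.ResetTimer() // exclude the setup above
			for j := 0; j < b.N; j++ {
				sum(samples[:using])
			}
		})
	}
}
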
+ } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1140,19 +1136,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index 12579994ab..de70170a12 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -950,31 +950,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G2Affine @@ -1003,19 +999,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG2Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 49484c36c9..1a7d1b4abe 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6633 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, 
batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -231,15 +244,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -257,7 +270,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -306,24 +319,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -344,22 +369,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index b367a2fe2f..38f438c71e 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index d53b7f5f82..e1c7e9056a 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1087,31 +1087,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, 
scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1140,19 +1136,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index 049841f4f7..5302819c4b 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -944,31 +944,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G2Affine @@ -997,19 +993,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG2Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { 
+ for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index e1daea4ffe..93b394e246 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6756 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... @@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
} @@ -231,15 +244,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -257,7 +270,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -306,24 +319,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -344,22 +369,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
} diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index f82d71c32f..ee8c765cd2 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 8694980eda..86b99ebd1a 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1098,31 +1098,27 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { +func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG1Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G1Affine @@ -1151,19 +1147,19 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG1Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG1Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for 
i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 3198411f9e..77c4e1d375 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -958,31 +958,27 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { +func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]fp.Element - - { - var lambdain [MAX_BATCH_SIZE]fp.Element - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } - } - - // invert denominator - BatchInvertG2Affine(&lambda, &lambdain, batchSize) + var lambda, lambdain [MAX_BATCH_SIZE]fp.Element + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) + } } + // invert denominator + batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + var d fp.Element var rr G2Affine @@ -1011,19 +1007,19 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine, batchSize int) { // batch inversion // similar to BatchInvertfp.Element, ignores edge cases -func BatchInvertG2Affine(res, a *[MAX_BATCH_SIZE]fp.Element, n int) { +func batchInvertG2Affine(res, a []fp.Element) { var accumulator fp.Element accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 9edda2244b..cdd2c92daf 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -16,7 +16,7 @@ package bw6761 -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -45,15 +45,15 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G1Affine // ... 
@@ -71,7 +71,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if cptP == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP], cptP) + BatchAddG1Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -120,24 +120,36 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -158,22 +170,23 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } @@ -231,15 +244,15 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here - cptP := 0 // count the number of point added to current batch + bucketIds := make(map[uint32]struct{}, batchSize) + cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack var R [MAX_BATCH_SIZE]*G2Affine // ... @@ -257,7 +270,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if cptP == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP], cptP) + BatchAddG2Affine(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -306,24 +319,36 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - queue := make([]batchOp, 0, 20*batchSize) // TODO find right capacity here. + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
+ var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func() { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -344,22 +369,23 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE-1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 884a8564f8..5cab324216 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -275,17 +275,10 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -295,7 +288,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G1Affine @@ -606,17 +599,10 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) @@ -626,7 +612,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]G2Affine diff --git a/internal/generator/config/curve.go b/internal/generator/config/curve.go index e1df940957..1ff4926ccf 100644 --- a/internal/generator/config/curve.go +++ b/internal/generator/config/curve.go @@ 
-68,7 +68,7 @@ var TwistedEdwardsCurves []TwistedEdwardsCurve func defaultCRange() []int { // default range for C values in the multiExp - return []int{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 20, 21} + return []int{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} } func addCurve(c *Curve) { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index a075f3b083..897401430f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -7,7 +7,7 @@ {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} -const MAX_BATCH_SIZE = 2000 +const MAX_BATCH_SIZE = 600 type batchOp struct { bucketID, pointID uint32 @@ -46,14 +46,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } // setup for the batch affine; - batchSize := len(buckets) / 30 + batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE } if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, len(buckets)/2) // TODO @gbotrel tune the capacity here + bucketIds := make(map[uint32]struct{}, batchSize) cptP := 0 // count the number of point added to current batch var P [MAX_BATCH_SIZE]{{ $.TAffine }} // allocated on the stack @@ -72,7 +72,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 if cptP == 0 { return } - BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP], cptP) + BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP]) for k := range bucketIds { delete(bucketIds, k) } @@ -122,25 +122,36 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } - queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. - + // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. + var queue [MAX_BATCH_SIZE]batchOp + qID := 0 processQueue := func () { - // for i := len(queue) - 1; i >= 0; i-- { - for i := 0; i < len(queue); i++ { + for i := qID - 1; i >= 0; i-- { if canAdd(queue[i].bucketID) { add(queue[i]) if isFull() { executeAndReset() } - queue[i] = queue[len(queue)-1] - queue = queue[:len(queue)-1] - i-- + queue[i] = queue[qID-1] + qID-- } } } - nbBatches := 0 + processTopQueue := func() { + for i := qID - 1; i >= 0; i-- { + if !canAdd(queue[i].bucketID) { + return + } + add(queue[i]) + if isFull() { + executeAndReset() + } + qID-- + } + } + for i, digit := range digits { if digit == 0 { @@ -161,22 +172,23 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 add(op) if isFull() { executeAndReset() - nbBatches++ - // if len(queue) != 0 { // TODO @gbotrel this doesn't seem to help much? should minimize queue resizing - // add(queue[len(queue)-1]) - // queue = queue[:len(queue)-1] - // } - processQueue() + processTopQueue() } } else { // put it in queue. - queue = append(queue, op) + queue[qID] = op + qID++ + if qID == MAX_BATCH_SIZE - 1 { + executeAndReset() + processQueue() + } + // queue = append(queue, op) } } // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) // executeAndReset() - for len(queue) != 0 { + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
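For reference, the per-point work that the BatchAdd template below amortizes is the standard short-Weierstrass chord-and-tangent step, with the expensive division replaced by a multiplication by a batch-inverted denominator. A toy, runnable version over F_101; the helper, curve, and variable names are illustrative only:

package main

import (
	"fmt"
	"math/big"
)

var p = big.NewInt(101) // toy field standing in for the base field modulus

func mod(z *big.Int) *big.Int { return z.Mod(z, p) }

// affineStep applies the chord/tangent formulas once the denominator's
// inverse lamInv is known (BatchAdd gets it from the shared batchInvert):
//
//	add (P != R): lambda = (yP - yR) * lamInv, with lamInv = 1/(xP - xR)
//	dbl (P == R): lambda = 3*xP^2   * lamInv, with lamInv = 1/(2*yP)
//	x' = lambda^2 - xR - xP,  y' = lambda*(xR - x') - yR
func affineStep(xR, yR, xP, yP, lamInv *big.Int, isDbl bool) (x3, y3 *big.Int) {
	lambda := new(big.Int)
	if isDbl {
		lambda.Mul(xP, xP)
		lambda.Mul(lambda, big.NewInt(3))
	} else {
		lambda.Sub(yP, yR)
	}
	mod(lambda.Mul(lambda, lamInv))
	x3 = new(big.Int).Mul(lambda, lambda)
	x3.Sub(x3, xR)
	mod(x3.Sub(x3, xP))
	y3 = new(big.Int).Sub(xR, x3)
	y3.Mul(y3, lambda)
	mod(y3.Sub(y3, yR))
	return
}

func main() {
	// P = (1, 2) lies on y^2 = x^3 + 3 over F_101; double it.
	x, y := big.NewInt(1), big.NewInt(2)
	lamInv := new(big.Int).ModInverse(new(big.Int).Lsh(y, 1), p) // 1/(2y)
	x3, y3 := affineStep(x, y, x, y, lamInv, true)
	fmt.Println(x3, y3) // 68 74, and 74^2 == 68^3 + 3 mod 101
}

In the batch, lamInv comes from one shared batchInvert call over all denominators, which is what the "cost add: 5*batchSize M + 1I, dbl: +1M" note in the comments is counting.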
} diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index 9fc9cc1651..c88ca29b88 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1574,32 +1574,28 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca // batch add/dbl in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}, batchSize int) { +func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}) { + batchSize := len(R) if batchSize == 0 { return } var isDbl [MAX_BATCH_SIZE]bool - var lambda [MAX_BATCH_SIZE]{{.CoordType}} + var lambda, lambdain [MAX_BATCH_SIZE]{{.CoordType}} - { - var lambdain [MAX_BATCH_SIZE]{{.CoordType}} - - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + for j := 0; j < batchSize; j++ { + // detect dbl vs add & compute denominator + if P[j].Equal(R[j]) { + isDbl[j] = true + lambdain[j].Double(&P[j].Y) + } else { + lambdain[j].Sub(&P[j].X, &R[j].X) } - - // invert denominator - BatchInvert{{ $TAffine }}(&lambda, &lambdain, batchSize) - } + // invert denominator + batchInvert{{ $TAffine }}(lambda[:batchSize], lambdain[:batchSize]) + var d {{.CoordType}} var rr {{ $TAffine }} @@ -1630,19 +1626,19 @@ func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}, batchSize i // batch inversion // similar to BatchInvert{{.CoordType}}, ignores edge cases -func BatchInvert{{ $TAffine }}(res, a *[MAX_BATCH_SIZE]{{.CoordType}}, n int) { +func batchInvert{{ $TAffine }}(res, a []{{.CoordType}}) { var accumulator {{.CoordType}} accumulator.SetOne() - for i := 0; i < n; i++ { + for i := 0; i < len(res); i++ { res[i] = accumulator accumulator.Mul(&accumulator, &a[i]) } accumulator.Inverse(&accumulator) - for i := n - 1; i >= 0; i-- { + for i := len(res) - 1; i >= 0; i-- { res[i].Mul(&res[i], &accumulator) accumulator.Mul(&accumulator, &a[i]) } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index c673254b2f..f75da11996 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -269,6 +269,8 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } + + func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { @@ -285,19 +287,13 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { fillBenchScalars(sampleScalars[:]) fillBenchBases{{ toUpper $.PointName }}(samplePoints[:]) + var testPoint {{ $.TAffine }} - for i := 14; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i - b.Run(fmt.Sprintf("%d points ext-jacobian", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points affine", using), func(b *testing.B) { + b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { b.ResetTimer() for j := 0; j < b.N; j++ { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) @@ -308,7 +304,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { func BenchmarkMultiExp{{ toUpper $.PointName }}Reference(b 
*testing.B) { - const nbSamples = 1 << 20 + const nbSamples = 1 << 23 var ( samplePoints [nbSamples]{{ $.TAffine }} From bc85933359b7e97d48eb5b3f7677c5514f266bab Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 12:27:45 -0600 Subject: [PATCH 10/43] test: gen scalars and bases in parallel --- ecc/bls12-377/multiexp_test.go | 55 ++++++++++--------- ecc/bls12-378/multiexp_test.go | 55 ++++++++++--------- ecc/bls12-381/multiexp_test.go | 55 ++++++++++--------- ecc/bls24-315/multiexp_test.go | 55 ++++++++++--------- ecc/bls24-317/multiexp_test.go | 55 ++++++++++--------- ecc/bn254/multiexp_test.go | 55 ++++++++++--------- ecc/bw6-633/multiexp_test.go | 55 ++++++++++--------- ecc/bw6-756/multiexp_test.go | 55 ++++++++++--------- ecc/bw6-761/multiexp_test.go | 55 ++++++++++--------- .../ecc/template/tests/multiexp.go.tmpl | 33 ++++++----- 10 files changed, 289 insertions(+), 239 deletions(-) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index ca2e50f59c..e9e7fd9c67 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -17,7 +17,9 @@ package bls12377 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
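The parallel fillers below all follow one pattern: parallel.Execute hands each worker a contiguous range, the range's first element is seeded from crypto/rand, and the rest are derived by a cheap deterministic walk. Here is a standalone approximation of that pattern; parallelExecute is a simplified stand-in for gnark-crypto's internal parallel.Execute, not its actual implementation:

package main

import (
	crand "crypto/rand"
	"fmt"
	"math"
	"math/big"
	"runtime"
	"sync"
)

// parallelExecute splits [0, n) into one contiguous range per CPU and runs
// work on each range in its own goroutine.
func parallelExecute(n int, work func(start, end int)) {
	nbCPU := runtime.NumCPU()
	chunk := (n + nbCPU - 1) / nbCPU
	var wg sync.WaitGroup
	for start := 0; start < n; start += chunk {
		end := start + chunk
		if end > n {
			end = n
		}
		wg.Add(1)
		go func(s, e int) { defer wg.Done(); work(s, e) }(start, end)
	}
	wg.Wait()
}

func main() {
	// Each range seeds its first element from crypto/rand (bounded by
	// MaxInt64, as in the tests) and fills the rest by a cheap walk.
	out := make([]*big.Int, 1<<10)
	max := new(big.Int).SetInt64(math.MaxInt64)
	parallelExecute(len(out), func(start, end int) {
		r, _ := crand.Int(crand.Reader, max)
		out[start] = r
		for i := start + 1; i < end; i++ {
			out[i] = new(big.Int).Add(out[i-1], big.NewInt(1))
		}
	})
	fmt.Println(out[0], out[len(out)-1])
}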
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 339323bbac..65729bd9c1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -17,7 +17,9 @@ package bls12378 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index ce2153872e..4b357be4d9 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -17,7 +17,9 @@ package bls12381 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index f8513bd3a1..0773215145 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -17,7 +17,9 @@ package bls24315 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 214d884e11..9ebe0c4217 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -17,7 +17,9 @@ package bls24317 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index bf12818dd5..f77115cab8 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -17,7 +17,9 @@ package bn254 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
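A side note on the two random sources the updated tests import: crypto/rand (aliased rrand) gives each worker a starting scalar without any shared *rand.Rand to coordinate, while math/rand stays in use for cheap reproducible plumbing such as the rand.Shuffle calls in the benchmarks added later in this series. A minimal illustration of the aliased imports; the rationale is my reading, not stated in the patch:

package main

import (
	crand "crypto/rand" // aliased, like rrand in the tests, to coexist with math/rand
	"fmt"
	"math"
	"math/big"
	mrand "math/rand"
)

func main() {
	// crypto/rand: safe for concurrent use with no seeding, bounded here by
	// MaxInt64 as in fillBenchBases.
	max := new(big.Int).SetInt64(math.MaxInt64)
	seed, err := crand.Int(crand.Reader, max)
	if err != nil {
		panic(err)
	}

	// math/rand: cheap pseudo-randomness, e.g. for permuting bench inputs.
	perm := mrand.Perm(8)
	mrand.Shuffle(len(perm), func(i, j int) { perm[i], perm[j] = perm[j], perm[i] })

	fmt.Println(seed, perm)
}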
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 38f438c71e..6946fe3b65 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -17,7 +17,9 @@ package bw6633 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index ee8c765cd2..73217d1e6a 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -17,7 +17,9 @@ package bw6756 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 5cab324216..e36993cfff 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -17,7 +17,9 @@ package bw6761 import ( + rrand "crypto/rand" "fmt" + "math" "math/big" "math/bits" "math/rand" @@ -28,6 +30,7 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -345,17 +348,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g1GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func TestMultiExpG2(t *testing.T) { @@ -669,22 +672,24 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG2(samplePoints []G2Affine) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&g2GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start + 1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index f75da11996..25102e24d0 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -11,12 +11,15 @@ import ( "fmt" "time" "math/rand" + rrand "crypto/rand" "math/big" "testing" "runtime" "math/bits" "sync" + "math" + "github.com/consensys/gnark-crypto/internal/parallel" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" "github.com/leanovate/gopter" @@ -363,24 +366,26 @@ func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
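One property the fillers rely on, per the comment above: bases should be pairwise distinct so that bucket accumulation exercises the addition path rather than the doubling path. A hypothetical sanity check one could drop next to the tests; the helper name, its placement, and the map-key use of fp.Element are assumptions, not part of the patch:

// assertDistinctX is a hypothetical sanity check for the bench setup: if all
// X coordinates differ, P[j].Equal(R[j]) can never be true, so BatchAdd stays
// on the addition branch instead of the "+1M" doubling branch.
// Assumes it lives in a curve package (e.g. next to g1_test.go), where
// fp.Element is a fixed-size array and therefore usable as a map key.
func assertDistinctX(points []G1Affine) {
	seen := make(map[fp.Element]struct{}, len(points))
	for i := range points {
		if _, dup := seen[points[i].X]; dup {
			panic("duplicate X coordinate: benchmark would bias toward doubling")
		}
		seen[points[i].X] = struct{}{}
	}
}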
func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { - var r big.Int - r.SetString("340444420969191673093399857471996460938405", 10) - samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) - - one := samplePoints[0].X - one.SetOne() - - for i := 1; i < len(samplePoints); i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &one) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) - } + max := new(big.Int).SetInt64(math.MaxInt64) + parallel.Execute(len(samplePoints), func(start, end int) { + r, _ := rrand.Int(rrand.Reader, max) + samplePoints[start].ScalarMultiplication(&{{$.PointName}}GenAff, r) + rr := samplePoints[start].X + rr.SetOne() + for i := start+1; i < end; i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) + } + }) } {{end }} func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - for i := 0; i < len(sampleScalars); i++ { - sampleScalars[i].SetRandom() - } + parallel.Execute(len(sampleScalars), func(start, end int) { + for i := start; i < end; i++ { + sampleScalars[i].SetRandom() + } + }) } From 091d0d56443dec14cd2884a76f2158bb61f5714b Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 13:49:55 -0600 Subject: [PATCH 11/43] test: add BatchAdd benchmark --- ecc/bls12-377/g1_test.go | 27 +++++++++++++++++++ ecc/bls12-377/g2_test.go | 27 +++++++++++++++++++ ecc/bls12-378/g1_test.go | 27 +++++++++++++++++++ ecc/bls12-378/g2_test.go | 27 +++++++++++++++++++ ecc/bls12-381/g1_test.go | 27 +++++++++++++++++++ ecc/bls12-381/g2_test.go | 27 +++++++++++++++++++ ecc/bls24-315/g1_test.go | 27 +++++++++++++++++++ ecc/bls24-315/g2_test.go | 27 +++++++++++++++++++ ecc/bls24-317/g1_test.go | 27 +++++++++++++++++++ ecc/bls24-317/g2_test.go | 27 +++++++++++++++++++ ecc/bn254/g1_test.go | 27 +++++++++++++++++++ ecc/bn254/g2_test.go | 27 +++++++++++++++++++ ecc/bw6-633/g1_test.go | 27 +++++++++++++++++++ ecc/bw6-633/g2_test.go | 27 +++++++++++++++++++ ecc/bw6-756/g1_test.go | 27 +++++++++++++++++++ ecc/bw6-756/g2_test.go | 27 +++++++++++++++++++ ecc/bw6-761/g1_test.go | 27 +++++++++++++++++++ ecc/bw6-761/g2_test.go | 27 +++++++++++++++++++ .../ecc/template/tests/point.go.tmpl | 27 +++++++++++++++++++ 19 files changed, 513 insertions(+) diff --git a/ecc/bls12-377/g1_test.go b/ecc/bls12-377/g1_test.go index 3209de0cd2..afb23458b1 100644 --- a/ecc/bls12-377/g1_test.go +++ b/ecc/bls12-377/g1_test.go @@ -19,6 +19,7 @@ package bls12377 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-377/g2_test.go b/ecc/bls12-377/g2_test.go index d3b0af12be..52e3ff41c1 100644 --- a/ecc/bls12-377/g2_test.go +++ b/ecc/bls12-377/g2_test.go @@ -19,6 +19,7 @@ package bls12377 import ( "fmt" "math/big" + 
"math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-378/g1_test.go b/ecc/bls12-378/g1_test.go index dccaca15c7..3859bb2695 100644 --- a/ecc/bls12-378/g1_test.go +++ b/ecc/bls12-378/g1_test.go @@ -19,6 +19,7 @@ package bls12378 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-378/g2_test.go b/ecc/bls12-378/g2_test.go index 21e79d7238..f81d14069b 100644 --- a/ecc/bls12-378/g2_test.go +++ b/ecc/bls12-378/g2_test.go @@ -19,6 +19,7 @@ package bls12378 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-381/g1_test.go b/ecc/bls12-381/g1_test.go index 9aa3311f05..ee4ce9fb21 100644 --- a/ecc/bls12-381/g1_test.go +++ b/ecc/bls12-381/g1_test.go @@ -19,6 +19,7 @@ package bls12381 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], 
ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls12-381/g2_test.go b/ecc/bls12-381/g2_test.go index c259606622..a243b65b01 100644 --- a/ecc/bls12-381/g2_test.go +++ b/ecc/bls12-381/g2_test.go @@ -19,6 +19,7 @@ package bls12381 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls24-315/g1_test.go b/ecc/bls24-315/g1_test.go index 5eba73ee93..d1061a803e 100644 --- a/ecc/bls24-315/g1_test.go +++ b/ecc/bls24-315/g1_test.go @@ -19,6 +19,7 @@ package bls24315 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls24-315/g2_test.go b/ecc/bls24-315/g2_test.go index bab8fbad10..ccdac4012c 100644 --- a/ecc/bls24-315/g2_test.go +++ b/ecc/bls24-315/g2_test.go @@ -19,6 +19,7 @@ package bls24315 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls24-317/g1_test.go b/ecc/bls24-317/g1_test.go index 2c08510b14..3673290566 100644 --- a/ecc/bls24-317/g1_test.go +++ b/ecc/bls24-317/g1_test.go @@ -19,6 +19,7 @@ package bls24317 import ( "fmt" 
"math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bls24-317/g2_test.go b/ecc/bls24-317/g2_test.go index 376b469347..74c8576f89 100644 --- a/ecc/bls24-317/g2_test.go +++ b/ecc/bls24-317/g2_test.go @@ -19,6 +19,7 @@ package bls24317 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/internal/fptower" @@ -505,6 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bn254/g1_test.go b/ecc/bn254/g1_test.go index 8ee025d787..c87502be96 100644 --- a/ecc/bn254/g1_test.go +++ b/ecc/bn254/g1_test.go @@ -19,6 +19,7 @@ package bn254 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/fp" @@ -460,6 +461,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bn254/g2_test.go b/ecc/bn254/g2_test.go index 17c09d95ba..83d34fee91 100644 --- a/ecc/bn254/g2_test.go +++ b/ecc/bn254/g2_test.go @@ -19,6 +19,7 @@ package bn254 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/internal/fptower" @@ -504,6 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx 
{ + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-633/g1_test.go b/ecc/bw6-633/g1_test.go index 6caf91227c..827cee65dd 100644 --- a/ecc/bw6-633/g1_test.go +++ b/ecc/bw6-633/g1_test.go @@ -19,6 +19,7 @@ package bw6633 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-633/g2_test.go b/ecc/bw6-633/g2_test.go index 32773e9718..82ddc5385b 100644 --- a/ecc/bw6-633/g2_test.go +++ b/ecc/bw6-633/g2_test.go @@ -19,6 +19,7 @@ package bw6633 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -486,6 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-756/g1_test.go b/ecc/bw6-756/g1_test.go index 81ecf81553..fc64f7646c 100644 --- a/ecc/bw6-756/g1_test.go +++ b/ecc/bw6-756/g1_test.go @@ -19,6 +19,7 @@ package bw6756 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-756/g2_test.go b/ecc/bw6-756/g2_test.go index ecfc973322..065dc4432e 100644 --- a/ecc/bw6-756/g2_test.go +++ b/ecc/bw6-756/g2_test.go @@ -19,6 +19,7 @@ package bw6756 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -486,6 +487,32 @@ func 
BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-761/g1_test.go b/ecc/bw6-761/g1_test.go index 6ace718ac2..3be460742f 100644 --- a/ecc/bw6-761/g1_test.go +++ b/ecc/bw6-761/g1_test.go @@ -19,6 +19,7 @@ package bw6761 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -499,6 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG1Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G1Affine + var RR [MAX_BATCH_SIZE]*G1Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG1Affine(RR[:], P[:]) + } + +} + func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/ecc/bw6-761/g2_test.go b/ecc/bw6-761/g2_test.go index 9630dbf178..0268875661 100644 --- a/ecc/bw6-761/g2_test.go +++ b/ecc/bw6-761/g2_test.go @@ -19,6 +19,7 @@ package bw6761 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -486,6 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } +func BenchmarkBatchAddG2Affine(b *testing.B) { + var P, R [MAX_BATCH_SIZE]G2Affine + var RR [MAX_BATCH_SIZE]*G2Affine + var ridx [MAX_BATCH_SIZE]int + + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) + + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } + + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAddG2Affine(RR[:], P[:]) + } + +} + func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element diff --git a/internal/generator/ecc/template/tests/point.go.tmpl b/internal/generator/ecc/template/tests/point.go.tmpl index 556d9befc3..e033f96dd7 100644 --- a/internal/generator/ecc/template/tests/point.go.tmpl +++ b/internal/generator/ecc/template/tests/point.go.tmpl @@ -16,6 +16,7 @@ import ( "fmt" "math/big" "testing" + "math/rand" {{if or (eq .CoordType "fptower.E2") (eq .CoordType "fptower.E4")}} "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" @@ -559,6 +560,32 @@ func Benchmark{{ $TJacobian }}IsInSubGroup(b *testing.B) { } +func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { + var P, R [MAX_BATCH_SIZE]{{ $TAffine }} + var RR [MAX_BATCH_SIZE]*{{ $TAffine }} + var ridx [MAX_BATCH_SIZE]int + + fillBenchBases{{ toUpper $.PointName }}(P[:]) + fillBenchBases{{ toUpper $.PointName }}(R[:]) + + for i:=0; i < len(ridx);i++ { + ridx[i] = i + } + + // 
random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + + for i, ri := range ridx { + RR[i] = &R[ri] + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + BatchAdd{{ $TAffine }}(RR[:], P[:]) + } + +} + func Benchmark{{ $TAffine }}BatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled var mixer fr.Element From f4b4eea58c4e039931694038ffc2a49fd54e95ba Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 9 Nov 2022 15:22:08 -0600 Subject: [PATCH 12/43] feat: add bitset to do quick bucket presence check in batch --- ecc/bls12-377/multiexp.go | 28 +++--- ecc/bls12-377/multiexp_affine.go | 86 ++++++++++++------- ecc/bls12-378/multiexp.go | 28 +++--- ecc/bls12-378/multiexp_affine.go | 86 ++++++++++++------- ecc/bls12-381/multiexp.go | 28 +++--- ecc/bls12-381/multiexp_affine.go | 86 ++++++++++++------- ecc/bls24-315/multiexp.go | 28 +++--- ecc/bls24-315/multiexp_affine.go | 86 ++++++++++++------- ecc/bls24-317/multiexp.go | 28 +++--- ecc/bls24-317/multiexp_affine.go | 86 ++++++++++++------- ecc/bn254/multiexp.go | 28 +++--- ecc/bn254/multiexp_affine.go | 86 ++++++++++++------- ecc/bw6-633/multiexp.go | 4 +- ecc/bw6-633/multiexp_affine.go | 68 ++++++++------- ecc/bw6-756/multiexp.go | 4 +- ecc/bw6-756/multiexp_affine.go | 68 ++++++++------- ecc/bw6-761/multiexp.go | 4 +- ecc/bw6-761/multiexp_affine.go | 68 ++++++++------- .../generator/ecc/template/multiexp.go.tmpl | 4 +- .../ecc/template/multiexp_affine.go.tmpl | 36 ++++---- 20 files changed, 545 insertions(+), 395 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 98de6ca242..24e4fe5ee0 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + 
processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 66c9d4e8bf..33cbf3844b 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. 
+ // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 917f493796..3f79b12596 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index a48d9d1cfd..b631d13a72 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 
0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. 
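A note on the sizing that follows: the batch is dimensioned at roughly one twentieth of the bucket count, clamped to [1, MAX_BATCH_SIZE]. A standalone sketch of that heuristic, with maxBatch as an illustrative stand-in for the package's MAX_BATCH_SIZE constant:

package main

import "fmt"

const maxBatch = 600 // illustrative bound only; the real code uses MAX_BATCH_SIZE

func batchSize(nbBuckets int) int {
	bs := nbBuckets / 20
	if bs > maxBatch {
		bs = maxBatch
	}
	if bs <= 0 {
		bs = 1
	}
	return bs
}

func main() {
	for _, c := range []int{10, 12, 16} {
		nbBuckets := 1 << (c - 1) // one bucket per signed digit value
		fmt.Printf("c=%d: %d buckets -> batch of %d\n", c, nbBuckets, batchSize(nbBuckets))
	}
}

Larger windows thus amortize each per-batch inversion over more points, until the fixed-size stack arrays cap the batch.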
batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
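Before the bitSetC* definitions in the next hunk, a minimal, self-contained sketch of the presence-check pattern they enable: membership becomes a single array index instead of a map lookup, and the per-batch reset is a zero-value copy instead of a delete loop (demoBitSet and the bucket numbers are illustrative only):

package main

import "fmt"

type demoBitSet [1 << (10 - 1)]bool // shaped like bitSetC10: one flag per bucket

func main() {
	var used demoBitSet

	canAdd := func(bID uint32) bool { return !used[bID] }

	used[42] = true         // bucket 42 now has a pending point in this batch
	fmt.Println(canAdd(42)) // false: a second add would read a stale bucket
	fmt.Println(canAdd(7))  // true

	var tmp demoBitSet // zero value: all flags cleared
	used = tmp         // reset between batches, no per-key deletes
	fmt.Println(canAdd(42)) // true again
}

Because the array length is a compile-time constant per window size, the whole set can live on the stack, which is the hint to the compiler the comments above refer to.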
@@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 8283ce4957..5bd22872c9 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := 
processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index d30c10293e..bf65dc9aa1 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... 
+ var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
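The queue machinery retained below exists for correctness, not speed: if an op targets a bucket already flagged in the current batch, adding it again would compute against a bucket value the pending batch is about to overwrite, so the op is parked and replayed only after executeAndReset has flushed the batch and cleared the bitset. A toy illustration of that scheduling, under the assumption that plain integers stand in for batchOp (every name here is hypothetical):

package main

import "fmt"

func main() {
	const batchCap = 3
	ops := []uint32{1, 2, 2, 3, 2, 4, 1, 4} // bucket IDs; repeats are conflicts

	var inBatch [8]bool // bitset: is this bucket already in the batch?
	batch := make([]uint32, 0, batchCap)
	var queue []uint32

	flush := func() {
		if len(batch) == 0 {
			return
		}
		fmt.Println("execute batch:", batch)
		batch = batch[:0]
		inBatch = [8]bool{} // zero-value reset, as executeAndReset does
	}

	for _, b := range ops {
		if inBatch[b] {
			queue = append(queue, b) // conflict: defer to a later batch
			continue
		}
		inBatch[b] = true
		batch = append(batch, b)
		if len(batch) == batchCap {
			flush()
		}
	}
	for len(queue) > 0 { // drain deferred ops, flushing so conflicts clear
		b := queue[0]
		queue = queue[1:]
		if inBatch[b] {
			flush()
		}
		inBatch[b] = true
		batch = append(batch, b)
	}
	flush()
}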
var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 922c80cd89..81dc28638c 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, 
points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index c7aa56e2d5..50a7c5613a 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. 
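Every branch of this processor eventually funnels its batch into BatchAddG1Affine, so it is worth restating what that call computes per slot once the shared batch inversion has produced the denominators. With R_j the bucket, P_j the incoming point, and a = 0 on these curves:

$$
\lambda_j =
\begin{cases}
\dfrac{3\,x_{P_j}^{2}}{2\,y_{P_j}} & \text{if } P_j = R_j \ \text{(doubling)}\\[2ex]
\dfrac{y_{P_j} - y_{R_j}}{x_{P_j} - x_{R_j}} & \text{otherwise,}
\end{cases}
\qquad
x_j' = \lambda_j^{2} - x_{R_j} - x_{P_j},
\qquad
y_j' = \lambda_j\,(x_{R_j} - x_j') - y_{R_j}.
$$

All the denominators (2y or x_P - x_R) are inverted together, so each point in the batch costs a handful of field multiplications while the single inversion is amortized across the whole batch.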
batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 923946e34f..56342df6d4 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index ccd70a9474..82ec92b92f 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 
0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. 
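Note on BatchAddG1Affine, which executeAndReset calls in this chunk processor: it amortizes the single field inversion of the affine-addition formula over the whole batch with Montgomery's batch-inversion trick (one inversion plus roughly 3(n-1) multiplications for n elements). A standalone sketch of that trick, using math/big over a toy prime rather than the library's fp.Element API:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert replaces each a[i] by a[i]^-1 mod p using a single modular
// inversion plus ~3(n-1) multiplications (Montgomery's trick).
func batchInvert(a []*big.Int, p *big.Int) {
	n := len(a)
	if n == 0 {
		return
	}
	// forward pass: prefix[i] = a[0]*...*a[i-1] mod p
	prefix := make([]*big.Int, n)
	acc := big.NewInt(1)
	for i := 0; i < n; i++ {
		prefix[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	// one inversion of the total product
	acc.ModInverse(acc, p)
	// backward pass: peel off one inverse per element
	for i := n - 1; i >= 0; i-- {
		inv := new(big.Int).Mul(prefix[i], acc) // = a[i]^-1, not yet reduced
		acc.Mul(acc, a[i]).Mod(acc, p)          // acc = inv(a[0]*...*a[i-1])
		a[i].Mod(inv, p)
	}
}

func main() {
	p := big.NewInt(101) // toy prime
	xs := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(50)}
	batchInvert(xs, p)
	fmt.Println(xs) // [34 29 99]: e.g. 3*34 = 102 = 1 mod 101
}

The forward pass stores running prefix products; after a single inversion of the total product, the backward pass recovers one inverse per element, which is why the per-addition cost quoted for the batch add stays at a handful of multiplications.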
batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
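Before the bitSet type list that follows, it is worth spelling out the scheduling invariant all these chunk processors share: a bucket may appear at most once per batch (two ops on the same bucket would alias the same R[i] inside one batch add), so conflicting ops are queued and retried after a flush. A schematic, self-contained sketch of that control flow, with bucket IDs only; flush stands in for the batch add plus executeAndReset, and the real queue-draining order differs:

package main

import "fmt"

const maxBatch = 4

func main() {
	ops := []uint32{1, 2, 1, 3, 2, 2, 4} // bucket IDs in arrival order
	var inBatch [8]bool                  // bitSet: bucket already in the current batch?
	batch := make([]uint32, 0, maxBatch)
	var queue []uint32 // ops postponed because their bucket is taken

	flush := func() { // stands in for BatchAddG2Affine + executeAndReset
		if len(batch) == 0 {
			return
		}
		fmt.Println("flush:", batch)
		var zero [8]bool
		inBatch = zero // one zero-value copy, no per-key deletes
		batch = batch[:0]
	}

	add := func(b uint32) {
		if inBatch[b] { // conflict: same bucket twice would alias one R[i]
			queue = append(queue, b)
			return
		}
		inBatch[b] = true
		batch = append(batch, b)
		if len(batch) == maxBatch {
			flush()
		}
	}

	for _, b := range ops {
		add(b)
	}
	for len(queue) > 0 { // drain: flushing frees the conflicting buckets
		pending := append([]uint32(nil), queue...)
		queue = queue[:0]
		flush()
		for _, b := range pending {
			add(b)
		}
	}
	flush()
}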
@@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 410e4016ab..64b2883493 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -172,31 +172,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -418,31 +418,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10] + processChunk := 
processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 62c70e876a..b750572b22 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... 
+ var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -249,7 +245,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -262,6 +258,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -269,15 +267,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -289,9 +286,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -326,8 +322,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -337,7 +332,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. 
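The `var queue [MAX_BATCH_SIZE]batchOp` / `qID` pair that replaces the earlier slice-with-TODO is a fixed-capacity, zero-allocation LIFO. The idiom in isolation (batchOp fields simplified; 600 is an arbitrary stand-in for MAX_BATCH_SIZE, whose real value is defined elsewhere in the package; closures are used here only for brevity):

package main

import "fmt"

type batchOp struct{ bucketID, pointID uint32 }

const maxBatchSize = 600 // arbitrary stand-in for MAX_BATCH_SIZE

func main() {
	// fixed-size array plus cursor: no heap allocation, and because the
	// length is a compile-time constant the storage can stay on the stack
	var queue [maxBatchSize]batchOp
	qID := 0

	push := func(op batchOp) { queue[qID] = op; qID++ }
	pop := func() batchOp { qID--; return queue[qID] }

	push(batchOp{bucketID: 7, pointID: 1})
	push(batchOp{bucketID: 9, pointID: 2})
	for qID != 0 {
		fmt.Println(pop()) // {9 2}, then {7 1}
	}
}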
var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -400,9 +394,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -459,3 +451,33 @@ type ibG2Affine interface { bucketG2AffineC15 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC6 [1 << (6 - 1)]bool +type bitSetC7 [1 << (7 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC9 [1 << (9 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool +type bitSetC13 [1 << (13 - 1)]bool +type bitSetC14 [1 << (14 - 1)]bool +type bitSetC15 [1 << (15 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC6 | + bitSetC7 | + bitSetC8 | + bitSetC9 | + bitSetC10 | + bitSetC11 | + bitSetC12 | + bitSetC13 | + bitSetC14 | + bitSetC15 | + bitSetC16 +} diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index b8fdb1314f..762cecb6c9 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -159,7 +159,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -368,7 +368,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 1a7d1b4abe..12762830f9 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. 
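The reason the bitSet is threaded through as a type parameter BS rather than passed as a slice or map: each instantiation has a compile-time array length, so `var bucketIds BS` needs no heap allocation, and resetting it is a single zero-value copy (`var tmp BS; bucketIds = tmp`) instead of a per-key map delete loop. A minimal standalone illustration of the pattern, with type names shortened:

package main

import "fmt"

type setC4 [1 << (4 - 1)]bool
type setC8 [1 << (8 - 1)]bool

type set interface{ setC4 | setC8 }

// countPerBatch counts first occurrences of each id within a batch. S has a
// constant size per instantiation, so `var s S` is a plain local array, and
// the reset between batches has the same shape as the bucketIds reset above.
func countPerBatch[S set](batches [][]uint32) []int {
	var s S
	out := make([]int, 0, len(batches))
	for _, batch := range batches {
		fresh := 0
		for _, id := range batch {
			if !s[id] {
				s[id] = true
				fresh++
			}
		}
		out = append(out, fresh)
		var zero S
		s = zero // reset: one zero-value copy
	}
	return out
}

func main() {
	fmt.Println(countPerBatch[setC4]([][]uint32{{1, 2, 2}, {2, 5}})) // [2 2]
}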
batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -231,7 +227,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -244,6 +240,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -251,15 +249,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -271,9 +268,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -308,8 +304,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -319,7 +314,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -382,9 +376,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -423,3 +415,15 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC8 | + bitSetC16 +} diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 98b3867477..649b86facf 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -160,7 +160,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -370,7 +370,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 93b394e246..e4748e2c8b 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. 
+ // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -231,7 +227,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -244,6 +240,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -251,15 +249,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -271,9 +268,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -308,8 +304,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -319,7 +314,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -382,9 +376,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -423,3 +415,15 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC8 | + bitSetC16 +} diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 9a41a9176f..9eb52c0130 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -160,7 +160,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstC processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -370,7 +370,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstC processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index cdd2c92daf..167bdf2902 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -32,7 +32,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -45,6 +45,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. 
+ // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -52,15 +54,14 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G1Affine // ... + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -72,9 +73,8 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, return } BatchAddG1Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -109,8 +109,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -120,7 +119,6 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -183,9 +181,7 @@ func processChunkG1BatchAffine[B ibG1Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -231,7 +227,7 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -244,6 +240,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -251,15 +249,14 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) - cptP := 0 // count the number of point added to current batch + var bucketIds BS // bitSet to signify presence of a bucket in current batch + cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // allocated on the stack - var R [MAX_BATCH_SIZE]*G2Affine // ... 
+ var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -271,9 +268,8 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, return } BatchAddG2Affine(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -308,8 +304,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -319,7 +314,6 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, cptP++ } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -382,9 +376,7 @@ func processChunkG2BatchAffine[B ibG2Affine](chunk uint64, // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. @@ -423,3 +415,15 @@ type ibG2Affine interface { bucketG2AffineC8 | bucketG2AffineC16 } + +type bitSetC4 [1 << (4 - 1)]bool +type bitSetC5 [1 << (5 - 1)]bool +type bitSetC8 [1 << (8 - 1)]bool +type bitSetC16 [1 << (16 - 1)]bool + +type bitSet interface { + bitSetC4 | + bitSetC5 | + bitSetC8 | + bitSetC16 +} diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 6dca2eb861..9e5ce00b70 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -427,7 +427,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $c 9}} processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}] + processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}] {{- end}} {{- if eq $c $lc}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk) @@ -435,7 +435,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} - processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}] + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}] {{- end}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 897401430f..16071451e5 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -33,7 +33,7 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine 
}}](chunk uint64, +func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, @@ -46,6 +46,8 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } // setup for the batch affine; + // we do that instead of a separate object to give enough hints to the compiler to.. + // keep things on the stack. batchSize := len(buckets) / 20 if batchSize > MAX_BATCH_SIZE { batchSize = MAX_BATCH_SIZE @@ -53,15 +55,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 if batchSize <= 0 { batchSize = 1 } - bucketIds := make(map[uint32]struct{}, batchSize) + var bucketIds BS // bitSet to signify presence of a bucket in current batch cptP := 0 // count the number of point added to current batch - var P [MAX_BATCH_SIZE]{{ $.TAffine }} // allocated on the stack - var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // ... + var P [MAX_BATCH_SIZE]{{ $.TAffine }} // points to be added to R (buckets) + var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // bucket references canAdd := func(bID uint32) bool { - _, ok := bucketIds[bID] - return !ok + return !bucketIds[bID] } isFull := func() bool { @@ -73,9 +74,8 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 return } BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP]) - for k := range bucketIds { - delete(bucketIds, k) - } + var tmp BS + bucketIds = tmp cptP = 0 } @@ -110,8 +110,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } } - // bucketIds[cptP] = op.bucketID - bucketIds[op.bucketID] = struct{}{} + bucketIds[op.bucketID] = true //struct{}{} R[cptP] = BK if op.isNeg() { P[cptP].Neg(PP) @@ -122,7 +121,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 } - // queue := make([]batchOp, 0, 20 * batchSize) // TODO find right capacity here. var queue [MAX_BATCH_SIZE]batchOp qID := 0 @@ -185,9 +183,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}](chunk uint64 // queue = append(queue, op) } } - // fmt.Printf("chunk %d\nlen(queue)=%d\nnbBatches=%d\nbatchSize=%d\nnbBuckets=%d\nnbPoints=%d\n\n", - // chunk, len(queue), nbBatches, batchSize, len(buckets), len(points)) - // executeAndReset() + for qID != 0 { processQueue() executeAndReset() // execute batch even if not full. 
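The `1<<({{$c}}-1)` array length generated below mirrors the bucket count of the signed-digit windowed method: digits are recoded into a symmetric range around zero, and a negative digit reuses the positive bucket with the point negated, halving the bucket (and thus bitset) count compared to plain unsigned windows. For instance:

package main

import "fmt"

func main() {
	// plain 0..2^c-1 digits would need 2^c - 1 buckets (digit 0 needs none);
	// signed recoding halves that, since digit -d reuses bucket d with -P
	for _, c := range []uint{4, 8, 16} {
		fmt.Printf("c=%2d  unsigned buckets=%5d  signed buckets=%5d\n",
			c, (1<<c)-1, 1<<(c-1))
	}
}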
@@ -226,3 +222,13 @@ type ib{{ $.TAffine }} interface {
 }
 
 {{end }}
+
+{{- range $c := $.G1.CRange}}
+type bitSetC{{$c}} [1<<({{$c}}-1)]bool
+{{- end}}
+
+type bitSet interface {
+	{{- range $i, $c := $.G1.CRange}}
+	bitSetC{{$c}} {{- if not (last $i $.G1.CRange)}} | {{- end}}
+	{{- end}}
+}

From b75ae0976bdc864c5d459d2b10aa43f3fbd7363d Mon Sep 17 00:00:00 2001
From: Gautam Botrel
Date: Thu, 10 Nov 2022 15:52:33 -0600
Subject: [PATCH 13/43] feat: restored split msm logic

---
 ecc/bls12-377/multiexp.go                   | 128 +++++++++---------
 ecc/bls12-377/multiexp_test.go              | 108 ++-------------
 ecc/bls12-378/multiexp.go                   | 128 +++++++++---------
 ecc/bls12-378/multiexp_test.go              | 108 ++-------------
 ecc/bls12-381/multiexp.go                   | 128 +++++++++---------
 ecc/bls12-381/multiexp_test.go              | 108 ++-------------
 ecc/bls24-315/multiexp.go                   | 128 +++++++++---------
 ecc/bls24-315/multiexp_test.go              | 108 ++-------------
 ecc/bls24-317/multiexp.go                   | 128 +++++++++---------
 ecc/bls24-317/multiexp_test.go              | 108 ++-------------
 ecc/bn254/multiexp.go                       | 128 +++++++++---------
 ecc/bn254/multiexp_test.go                  | 108 ++-------------
 ecc/bw6-633/multiexp.go                     | 128 +++++++++---------
 ecc/bw6-633/multiexp_test.go                | 108 ++-------------
 ecc/bw6-756/multiexp.go                     | 128 +++++++++---------
 ecc/bw6-756/multiexp_test.go                | 108 ++-------------
 ecc/bw6-761/multiexp.go                     | 128 +++++++++---------
 ecc/bw6-761/multiexp_test.go                | 108 ++-------------
 .../generator/ecc/template/multiexp.go.tmpl |  72 +++++-----
 .../ecc/template/tests/multiexp.go.tmpl     |  57 +-------
 20 files changed, 727 insertions(+), 1526 deletions(-)

diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go
index 24e4fe5ee0..3c7e9fa3e4 100644
--- a/ecc/bls12-377/multiexp.go
+++ b/ecc/bls12-377/multiexp.go
@@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if post-split we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference in CPU usage is in our favor, we split.
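A worked instance of the decision above, with fr.Limbs = 4 (256-bit scalars), 32 available tasks, and hypothetical bestC results of 16 and 13 for the full and half-size inputs (the real bestC consults per-curve lookup tables):

package main

import "fmt"

const frLimbs = 4 // fr.Limbs for a 256-bit scalar field

func nbChunks(c int) int {
	n := frLimbs * 64 / c
	if (frLimbs*64)%c != 0 {
		n++
	}
	return n
}

func main() {
	nbTasks := 32       // config.NbTasks, e.g. runtime.NumCPU()
	c, cSplit := 16, 13 // hypothetical bestC results for n and n/2 points

	chunks := nbChunks(c)                  // tasks if we don't split
	tasksPostSplit := 2 * nbChunks(cSplit) // two half-size MSMs in parallel

	split := tasksPostSplit <= nbTasks ||
		tasksPostSplit-nbTasks <= nbTasks-chunks
	fmt.Println(chunks, tasksPostSplit, split) // 16 40 true: 8 excess tasks beats 16 idle workers
}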
+			config.NbTasks /= 2
+			var _p G1Jac
+			chDone := make(chan struct{}, 1)
+			go func() {
+				innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config)
+				close(chDone)
+			}()
+			innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config)
+			<-chDone
+			p.AddAssign(&_p)
+			return p, nil
+		}
+	}
+
+	innerMsmG1(p, int(C), points, scalars, config)
+
+	return p, nil
+}
+
+func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) {
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
-	innerMsmG1(p, int(C), points, digits, splitFirstChunk)
-	// we have nbSplits intermediate results that we must sum together.
-
-	// _p := make([]G1Jac, nbSplits - 1)
-	// chDone := make(chan int, nbSplits - 1)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	start := i * nbPoints
-	// 	end := start + nbPoints
-	// 	go func(start, end, i int) {
-	// 		innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-	// 		chDone <- i
-	// 	}(start, end, i)
-	// }
-
-	// innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	done := <-chDone
-	// 	p.AddAssign(&_p[done])
-	// }
-	// close(chDone)
-	return p, nil
-}
-
-func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) {
 
 	switch c {
 
 	case 4:
@@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if post-split we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference in CPU usage is in our favor, we split.
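The carry propagation mentioned in the innerMsmG1 comment above ("if it's larger than 2^{c-1} ... we have a carry we need to propagate") is easiest to see on a single machine word. A toy recoding of one uint64 scalar into signed base-2^c digits; the real partitionScalars works limb-by-limb on fr.Element and packs sign and magnitude into uint32 digits:

package main

import (
	"fmt"
	"math/bits"
)

// recode splits a 64-bit scalar into signed base-2^c digits, propagating a
// carry whenever a window digit exceeds 2^(c-1).
func recode(scalar uint64, c uint) []int64 {
	nbChunks := (uint(bits.Len64(scalar)) + c - 1) / c
	digits := make([]int64, 0, nbChunks+1)
	carry := int64(0)
	for i := uint(0); i < nbChunks; i++ {
		d := int64((scalar>>(i*c))&(1<<c-1)) + carry
		carry = 0
		if d > 1<<(c-1) { // too big for a bucket: borrow from the next window
			d -= 1 << c
			carry = 1
		}
		digits = append(digits, d)
	}
	if carry != 0 {
		digits = append(digits, carry)
	}
	return digits
}

func main() {
	const c = 4
	s := uint64(0xDE) // 222
	digits := recode(s, c)
	fmt.Println(digits) // [-2 -2 1]: 222 = -2 - 2*16 + 1*256
	// sanity check: the signed digits re-sum to the scalar
	var acc int64
	for i, d := range digits {
		acc += d * (int64(1) << (c * uint(i)))
	}
	fmt.Println(acc == int64(s)) // true
}

Every digit magnitude ends up at most 2^(c-1), which is exactly why the bitSetC types and the affine buckets are sized 1<<(c-1).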
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index e9e7fd9c67..3da962b555 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 3f79b12596..2371d32a5c 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 65729bd9c1..1cf20b793f 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 5bd22872c9..e13ca90588 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
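The code that follows this comment block runs the two half-MSMs concurrently and folds the partial results with AddAssign. Below is a sketch of that fork-join shape on plain integers; sumRange and the slice halving stand in for innerMsmG1/innerMsmG2 over points[:n/2] and points[n/2:].

package main

import "fmt"

// sumRange stands in for one half of the MSM work.
func sumRange(xs []int) int {
	s := 0
	for _, x := range xs {
		s += x
	}
	return s
}

func main() {
	xs := []int{1, 2, 3, 4, 5, 6}
	half := len(xs) / 2

	var lo int
	chDone := make(chan struct{}, 1)
	go func() {
		lo = sumRange(xs[:half]) // first half on a separate goroutine
		close(chDone)
	}()
	hi := sumRange(xs[half:]) // second half on the current goroutine
	<-chDone                  // join, then fold the two partial results
	fmt.Println(lo + hi)      // 21; the MSM folds with p.AddAssign(&_p)
}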
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 4b357be4d9..b44ec363a2 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 81dc28638c..d73bb2783e 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
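nbChunks above counts the c-bit windows ("radixes") of a scalar, i.e. ceil(fr.Limbs*64/c). A toy single-limb version of that windowing, least-significant window first; digitsBase2c is an illustrative name and the real code works on fr.Limbs 64-bit limbs, not one uint64.

package main

import "fmt"

// digitsBase2c splits a single-limb scalar into its c-bit windows,
// least-significant window first.
func digitsBase2c(scalar uint64, c uint) []uint64 {
	mask := uint64(1)<<c - 1
	n := (64 + c - 1) / c // ceil(64/c), as in the nbChunks computation
	out := make([]uint64, 0, n)
	for i := uint(0); i < n; i++ {
		out = append(out, (scalar>>(i*c))&mask)
	}
	return out
}

func main() {
	fmt.Println(digitsBase2c(0xABCD, 4)[:4]) // [13 12 11 10]
}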
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 0773215145..8da7433fa9 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 56342df6d4..ceee16e7fd 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
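The innerMsm comment later in this hunk notes that partitionScalars walks the c-bit windows from the LSW and propagates a carry whenever a window exceeds 2^{c-1}. Here is a single-limb sketch of that signed-digit recoding, under the assumption that digits are kept in [-2^{c-1}, 2^{c-1}]; the name and scope are illustrative, not the actual partitionScalars implementation.

package main

import "fmt"

// signedDigits recodes c-bit windows into [-2^(c-1), 2^(c-1)]: whenever a
// window exceeds 2^(c-1) we emit d-2^c instead and carry 1 into the next
// window, which is the carry propagation the comment refers to.
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	half := int64(1) << (c - 1)
	n := (64+c-1)/c + 1 // one extra window to absorb a final carry
	out := make([]int64, 0, n)
	carry := int64(0)
	for i := uint(0); i < n; i++ {
		d := carry
		if i*c < 64 {
			d += int64((scalar >> (i * c)) & mask)
		}
		carry = 0
		if d > half { // too large: emit d-2^c and propagate a carry up
			d -= int64(1) << c
			carry = 1
		}
		out = append(out, d)
	}
	return out
}

func main() {
	// with c=4, the window 0xF = 15 > 8 becomes -1 plus a carry of 1,
	// and indeed -1 + 1*16 = 15.
	fmt.Println(signedDigits(0xF, 4)[:3]) // [-1 1 0]
}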
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 9ebe0c4217..3d99991dc1 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range 
cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ 
-514,39 +459,6 @@ func TestMultiExpG2(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G2Affine - var g G2Jac - g.Set(&g2Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g2Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G2Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG2(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 64b2883493..1167251103 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
+ config.NbTasks /= 2 + var _p G1Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG1(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG1(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G1Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: @@ -345,54 +347,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul C = c } } - // empirical, needs to be tuned. - // if C > 16 && nbPoints < 1 << 23 { - // C = 16 - // } return C } - // TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars. - // nbSplits := 1 C := bestC(nbPoints) nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar if (fr.Limbs*64)%C != 0 { nbChunks++ } + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split + if config.NbTasks > 1 && nbChunks < config.NbTasks { + // before spliting, let's see if we endup with more tasks than thread; + cSplit := bestC(nbPoints / 2) + nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) + if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit++ + } + nbTasksPostSplit := nbChunksPostSplit * 2 + if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + // if postSplit we still have less tasks than available CPU + // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
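On the test side, this patch drops the manual partitionScalars/splitFirstChunk plumbing and drives innerMsm through ecc.MultiExpConfig, asserting that every window size in cRange yields the same point. A schematic of that consistency-check pattern on plain integers; compute is a stand-in for innerMsm, whose result must be independent of c.

package main

import "fmt"

// compute stands in for innerMsm invoked with a given window size c; the
// window size changes the bucketing, not the result, so a plain sum works
// as a schematic here.
func compute(c int, xs []int) int {
	_ = c
	s := 0
	for _, x := range xs {
		s += x
	}
	return s
}

func main() {
	cRange := []int{4, 5, 8, 16}
	xs := []int{3, 1, 4, 1, 5}

	results := make([]int, len(cRange)) // one slot per c; no extra "split" run
	for i, c := range cRange {
		results[i] = compute(c, xs)
	}
	for i := 1; i < len(results); i++ {
		if results[i] != results[i-1] {
			fmt.Println("mismatch at c =", cRange[i])
			return
		}
	}
	fmt.Println("all window sizes agree")
}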
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index f77115cab8..e360f2f20f 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, 
_ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -514,39 +459,6 @@ func 
TestMultiExpG2(t *testing.T) {
 		genScalar,
 	))
 
-	properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
-		func(mixer fr.Element) bool {
-			// multi exp points
-			var samplePoints [nbSamples]G2Affine
-			var g G2Jac
-			g.Set(&g2Gen)
-			for i := 1; i <= nbSamples; i++ {
-				samplePoints[i-1].FromJacobian(&g)
-				g.AddAssign(&g2Gen)
-			}
-			// mixer ensures that all the words of a fpElement are set
-			var sampleScalars [nbSamples]fr.Element
-
-			for i := 1; i <= nbSamples; i++ {
-				sampleScalars[i-1].SetUint64(uint64(i)).
-					Mul(&sampleScalars[i-1], &mixer).
-					FromMont()
-			}
-
-			var result1, result2 G2Jac
-			for _, c := range cRange {
-				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				innerMsmG2(&result1, int(c), samplePoints[:], scalars, false)
-				innerMsmG2(&result2, int(c), samplePoints[:], scalars, false)
-				if !result1.Equal(&result2) {
-					return false
-				}
-			}
-			return true
-		},
-		genScalar,
-	))
-
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go
index 762cecb6c9..3ca13bafc4 100644
--- a/ecc/bw6-633/multiexp.go
+++ b/ecc/bw6-633/multiexp.go
@@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
+			config.NbTasks /= 2
+			var _p G1Jac
+			chDone := make(chan struct{}, 1)
+			go func() {
+				innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config)
+				close(chDone)
+			}()
+			innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config)
+			<-chDone
+			p.AddAssign(&_p)
+			return p, nil
+		}
+	}
+
+	innerMsmG1(p, int(C), points, scalars, config)
+
+	return p, nil
+}
+
+func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) {
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	innerMsmG1(p, int(C), points, digits, splitFirstChunk)
-	// we have nbSplits intermediate results that we must sum together.
-
-	// _p := make([]G1Jac, nbSplits - 1)
-	// chDone := make(chan int, nbSplits - 1)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	start := i * nbPoints
-	// 	end := start + nbPoints
-	// 	go func(start, end, i int) {
-	// 		innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-	// 		chDone <- i
-	// 	}(start, end, i)
-	// }
-
-	// innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	done := <-chDone
-	// 	p.AddAssign(&_p[done])
-	// }
-	// close(chDone)
-	return p, nil
-}
-
-func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) {
 
 	switch c {
 
 	case 4:
@@ -308,54 +310,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
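The split decision above can be read as a standalone predicate. A minimal sketch, assuming a package-level bestC with the same semantics as the closure above (worthSplitting and numChunks are illustrative names, not part of this patch):

func numChunks(c int) int {
	nb := int(fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
	if (fr.Limbs*64)%c != 0 {
		nb++ // partial top window
	}
	return nb
}

func worthSplitting(nbPoints, nbTasks int) bool {
	nbChunks := numChunks(bestC(nbPoints)) // one task per chunk if we don't split
	if nbTasks <= 1 || nbChunks >= nbTasks {
		return false // the chunks already saturate the task budget
	}
	// two half-size MSMs, each with its own (possibly smaller) window size
	nbTasksPostSplit := 2 * numChunks(bestC(nbPoints / 2))
	// split if the halves still fit in the budget, or if the overshoot is
	// smaller than the number of tasks we would otherwise leave idle
	return nbTasksPostSplit <= nbTasks ||
		(nbTasksPostSplit-nbTasks) <= (nbTasks-nbChunks)
}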
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 6946fe3b65..d4e07c80ce 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - 
scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -514,39 +459,6 
@@ func TestMultiExpG2(t *testing.T) {
 		genScalar,
 	))
 
-	properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
-		func(mixer fr.Element) bool {
-			// multi exp points
-			var samplePoints [nbSamples]G2Affine
-			var g G2Jac
-			g.Set(&g2Gen)
-			for i := 1; i <= nbSamples; i++ {
-				samplePoints[i-1].FromJacobian(&g)
-				g.AddAssign(&g2Gen)
-			}
-			// mixer ensures that all the words of a fpElement are set
-			var sampleScalars [nbSamples]fr.Element
-
-			for i := 1; i <= nbSamples; i++ {
-				sampleScalars[i-1].SetUint64(uint64(i)).
-					Mul(&sampleScalars[i-1], &mixer).
-					FromMont()
-			}
-
-			var result1, result2 G2Jac
-			for _, c := range cRange {
-				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				innerMsmG2(&result1, int(c), samplePoints[:], scalars, false)
-				innerMsmG2(&result2, int(c), samplePoints[:], scalars, false)
-				if !result1.Equal(&result2) {
-					return false
-				}
-			}
-			return true
-		},
-		genScalar,
-	))
-
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go
index 649b86facf..87598422a7 100644
--- a/ecc/bw6-756/multiexp.go
+++ b/ecc/bw6-756/multiexp.go
@@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
+			config.NbTasks /= 2
+			var _p G1Jac
+			chDone := make(chan struct{}, 1)
+			go func() {
+				innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config)
+				close(chDone)
+			}()
+			innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config)
+			<-chDone
+			p.AddAssign(&_p)
+			return p, nil
+		}
+	}
+
+	innerMsmG1(p, int(C), points, scalars, config)
+
+	return p, nil
+}
+
+func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) {
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	innerMsmG1(p, int(C), points, digits, splitFirstChunk)
-	// we have nbSplits intermediate results that we must sum together.
-
-	// _p := make([]G1Jac, nbSplits - 1)
-	// chDone := make(chan int, nbSplits - 1)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	start := i * nbPoints
-	// 	end := start + nbPoints
-	// 	go func(start, end, i int) {
-	// 		innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-	// 		chDone <- i
-	// 	}(start, end, i)
-	// }
-
-	// innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	done := <-chDone
-	// 	p.AddAssign(&_p[done])
-	// }
-	// close(chDone)
-	return p, nil
-}
-
-func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) {
 
 	switch c {
 
 	case 4:
@@ -309,54 +311,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
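The carry mentioned in the innerMsm comments (a window digit larger than 2^{c-1} borrows from the next window) can be seen on a toy recoding. This sketch is illustrative only; signedDigits is not the library routine, and partitionScalars additionally packs the digits for parallel chunk processing:

func signedDigits(k uint64, c int) []int64 {
	mask := uint64(1)<<c - 1
	var digits []int64
	carry := uint64(0)
	for k != 0 || carry != 0 {
		d := (k & mask) + carry
		k >>= c
		carry = 0
		if d > 1<<(c-1) {
			// window value too large for a bucket: emit d - 2^c and
			// propagate a carry into the next (higher) window
			digits = append(digits, int64(d)-int64(1)<<c)
			carry = 1
		} else {
			digits = append(digits, int64(d))
		}
	}
	return digits
}

For c = 4 and k = 13, the first window holds 13 > 8, so it becomes -3 with a carry, giving digits [-3, 1] (13 = -3 + 1*16); every digit lands in [-(2^{c-1}-1), 2^{c-1}], which halves the number of buckets needed.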
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 73217d1e6a..64f9ca10ec 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - 
scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -514,39 +459,6 
@@ func TestMultiExpG2(t *testing.T) {
 		genScalar,
 	))
 
-	properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
-		func(mixer fr.Element) bool {
-			// multi exp points
-			var samplePoints [nbSamples]G2Affine
-			var g G2Jac
-			g.Set(&g2Gen)
-			for i := 1; i <= nbSamples; i++ {
-				samplePoints[i-1].FromJacobian(&g)
-				g.AddAssign(&g2Gen)
-			}
-			// mixer ensures that all the words of a fpElement are set
-			var sampleScalars [nbSamples]fr.Element
-
-			for i := 1; i <= nbSamples; i++ {
-				sampleScalars[i-1].SetUint64(uint64(i)).
-					Mul(&sampleScalars[i-1], &mixer).
-					FromMont()
-			}
-
-			var result1, result2 G2Jac
-			for _, c := range cRange {
-				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				innerMsmG2(&result1, int(c), samplePoints[:], scalars, false)
-				innerMsmG2(&result2, int(c), samplePoints[:], scalars, false)
-				if !result1.Equal(&result2) {
-					return false
-				}
-			}
-			return true
-		},
-		genScalar,
-	))
-
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go
index 9eb52c0130..d5165db4a5 100644
--- a/ecc/bw6-761/multiexp.go
+++ b/ecc/bw6-761/multiexp.go
@@ -99,54 +99,56 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
+			config.NbTasks /= 2
+			var _p G1Jac
+			chDone := make(chan struct{}, 1)
+			go func() {
+				innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config)
+				close(chDone)
+			}()
+			innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config)
+			<-chDone
+			p.AddAssign(&_p)
+			return p, nil
+		}
+	}
+
+	innerMsmG1(p, int(C), points, scalars, config)
+
+	return p, nil
+}
+
+func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) {
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	innerMsmG1(p, int(C), points, digits, splitFirstChunk)
-	// we have nbSplits intermediate results that we must sum together.
-
-	// _p := make([]G1Jac, nbSplits - 1)
-	// chDone := make(chan int, nbSplits - 1)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	start := i * nbPoints
-	// 	end := start + nbPoints
-	// 	go func(start, end, i int) {
-	// 		innerMsmG1(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk)
-	// 		chDone <- i
-	// 	}(start, end, i)
-	// }
-
-	// innerMsmG1(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk)
-	// for i:=0; i < nbSplits-1; i++ {
-	// 	done := <-chDone
-	// 	p.AddAssign(&_p[done])
-	// }
-	// close(chDone)
-	return p, nil
-}
-
-func innerMsmG1(p *G1Jac, c int, points []G1Affine, digits []uint32, splitFirstChunk bool) {
 
 	switch c {
 
 	case 4:
@@ -309,54 +311,56 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
 	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
 	if (fr.Limbs*64)%C != 0 {
 		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints / 2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit * 2
+		if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
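From the caller's side, the task budget that feeds this heuristic comes in through ecc.MultiExpConfig, the same way the updated tests drive it. A minimal usage sketch (variable names illustrative; runtime and ecc are the imports assumed):

	var p G1Jac
	if _, err := p.MultiExp(points, scalars, ecc.MultiExpConfig{
		NbTasks: runtime.NumCPU(), // upper bound on the tasks MultiExp may spawn
	}); err != nil {
		// handle the error (e.g. a length mismatch between points and scalars)
	}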
+ config.NbTasks /= 2 + var _p G2Jac + chDone := make(chan struct{}, 1) + go func() { + innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsmG2(p, int(C), points, scalars, config) + + return p, nil +} + +func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsmG2(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - // _p := make([]G2Jac, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsmG2(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsmG2(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, digits []uint32, splitFirstChunk bool) { switch c { case 4: diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index e36993cfff..613a4bac53 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -94,8 +94,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG1(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -130,14 +129,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -171,14 +165,9 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - results := make([]G1Jac, len(cRange)+1) + results := make([]G1Jac, len(cRange)) for i, c := range cRange { - 
scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG1(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -190,39 +179,6 @@ func TestMultiExpG1(t *testing.T) { genScalar, )) - properties.Property(fmt.Sprintf("[G1] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]G1Affine - var g G1Jac - g.Set(&g1Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&g1Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() - } - - var result1, result2 G1Jac - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG1(&result1, int(c), samplePoints[:], scalars, false) - innerMsmG1(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[G1] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( @@ -420,8 +376,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsmG2(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -454,14 +409,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -495,14 +445,9 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - results := make([]G2Jac, len(cRange)+1) + results := make([]G2Jac, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsmG2(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsmG2(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -514,39 +459,6 
@@ func TestMultiExpG2(t *testing.T) {
 		genScalar,
 	))
 
-	properties.Property(fmt.Sprintf("[G2] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll(
-		func(mixer fr.Element) bool {
-			// multi exp points
-			var samplePoints [nbSamples]G2Affine
-			var g G2Jac
-			g.Set(&g2Gen)
-			for i := 1; i <= nbSamples; i++ {
-				samplePoints[i-1].FromJacobian(&g)
-				g.AddAssign(&g2Gen)
-			}
-			// mixer ensures that all the words of a fpElement are set
-			var sampleScalars [nbSamples]fr.Element
-
-			for i := 1; i <= nbSamples; i++ {
-				sampleScalars[i-1].SetUint64(uint64(i)).
-					Mul(&sampleScalars[i-1], &mixer).
-					FromMont()
-			}
-
-			var result1, result2 G2Jac
-			for _, c := range cRange {
-				scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU())
-				innerMsmG2(&result1, int(c), samplePoints[:], scalars, false)
-				innerMsmG2(&result2, int(c), samplePoints[:], scalars, false)
-				if !result1.Equal(&result2) {
-					return false
-				}
-			}
-			return true
-		},
-		genScalar,
-	))
-
 	// note : this test is here as we expect to have a different multiExp than the above bucket method
 	// for small number of points
 	properties.Property("[G2] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl
index 9e5ce00b70..67ed6b9027 100644
--- a/internal/generator/ecc/template/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/multiexp.go.tmpl
@@ -368,56 +368,58 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 			C = c
 		}
 	}
-	// empirical, needs to be tuned.
-	// if C > 16 && nbPoints < 1 << 23 {
-	// 	C = 16
-	// }
 	return C
 }
 
-	// TODO @gbotrel restore split by calling outterMsm BEFORE partitioning scalars.
-	// nbSplits := 1
 	C := bestC(nbPoints)
-	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
-	if (fr.Limbs * 64) % C != 0 {
-		nbChunks ++
+	nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar
+	if (fr.Limbs*64)%C != 0 {
+		nbChunks++
 	}
+	// if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split
+	if config.NbTasks > 1 && nbChunks < config.NbTasks {
+		// before splitting, let's see if we end up with more tasks than threads;
+		cSplit := bestC(nbPoints/2)
+		nbChunksPostSplit := int(fr.Limbs * 64 / cSplit)
+		if (fr.Limbs*64)%cSplit != 0 {
+			nbChunksPostSplit++
+		}
+		nbTasksPostSplit := nbChunksPostSplit*2
+		if (nbTasksPostSplit <= config.NbTasks) || ( nbTasksPostSplit - config.NbTasks ) <= ( config.NbTasks - nbChunks) {
+			// if postSplit we still have fewer tasks than available CPUs,
+			// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
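Concretely, for a 256-bit scalar field (fr.Limbs = 4) with C = 16 the computation above yields nbChunks = 256/16 = 16; on a budget of config.NbTasks = 32, half the tasks would sit idle. Assuming bestC also returns 16 for the half size, splitting gives nbTasksPostSplit = 2 * 16 = 32 <= 32, so the branch below runs the two half-MSMs concurrently and saturates the budget, while config.NbTasks /= 2 keeps the two inner calls from oversubscribing it between them.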
+ config.NbTasks /= 2 + var _p {{ $.TJacobian }} + chDone := make(chan struct{}, 1) + go func() { + innerMsm{{ $.UPointName }}(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + close(chDone) + }() + innerMsm{{ $.UPointName }}(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + <-chDone + p.AddAssign(&_p) + return p, nil + } + } + + innerMsm{{ $.UPointName }}(p, int(C), points, scalars, config) + + return p, nil +} + + +func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) { // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - innerMsm{{ $.UPointName }}(p, int(C), points, digits, splitFirstChunk) - // we have nbSplits intermediate results that we must sum together. - - - // _p := make([]{{ $.TJacobian }}, nbSplits - 1) - // chDone := make(chan int, nbSplits - 1) - // for i:=0; i < nbSplits-1; i++ { - // start := i * nbPoints - // end := start + nbPoints - // go func(start, end, i int) { - // innerMsm{{ $.UPointName }}(&_p[i], int(C), points[start:end], scalars[start:end], splitFirstChunk) - // chDone <- i - // }(start, end, i) - // } - - // innerMsm{{ $.UPointName }}(p, int(C), points[(nbSplits - 1) * nbPoints:], scalars[(nbSplits - 1) * nbPoints:], splitFirstChunk) - // for i:=0; i < nbSplits-1; i++ { - // done := <-chDone - // p.AddAssign(&_p[done]) - // } - // close(chDone) - return p, nil -} - -func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, digits []uint32, splitFirstChunk bool) { {{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}} {{- /* also need to determine until which window size the ext-jacobian version is worth it. 
*/}} switch c { diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 25102e24d0..e6cd2b0338 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -10,11 +10,11 @@ import ( "fmt" "time" + "runtime" "math/rand" rrand "crypto/rand" "math/big" "testing" - "runtime" "math/bits" "sync" "math" @@ -93,8 +93,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { FromMont() } - scalars16, _ := partitionScalars(sampleScalars[:], 16, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePointsLarge[:], scalars16, true) + innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) @@ -138,14 +137,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } - results := make([]{{ $.TJacobian }}, len(cRange) + 1) + results := make([]{{ $.TJacobian }}, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePoints[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsm{{ toUpper $.PointName }}(&results[len(results)-1], 16, samplePoints[:], scalars, true) - } + innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()}) } for i:=1; i < len(results);i++ { if !results[i].Equal(&results[i-1]) { @@ -179,14 +173,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { FromMont() } - results := make([]{{ $.TJacobian }}, len(cRange)+1) + results := make([]{{ $.TJacobian }}, len(cRange)) for i, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePointsZero[:], scalars, false) - if c == 16 { - // split the first chunk - innerMsm{{ toUpper $.PointName }}(&results[len(results)-1], 16, samplePointsZero[:], scalars, true) - } + innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -199,40 +188,6 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { )) - properties.Property(fmt.Sprintf("[{{ toUpper $.PointName }}] MultiExp and MultiExpBatchAffine (c in %v) should output the same result", cRange), prop.ForAll( - func(mixer fr.Element) bool { - // multi exp points - var samplePoints [nbSamples]{{ $.TAffine }} - var g {{ $.TJacobian }} - g.Set(&{{ toLower .PointName}}Gen) - for i := 1; i <= nbSamples; i++ { - samplePoints[i-1].FromJacobian(&g) - g.AddAssign(&{{ toLower .PointName}}Gen) - } - // mixer ensures that all the words of a fpElement are set - var sampleScalars [nbSamples]fr.Element - - for i := 1; i <= nbSamples; i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() - } - - var result1, result2 {{ $.TJacobian }} - for _, c := range cRange { - scalars, _ := partitionScalars(sampleScalars[:], c, false, runtime.NumCPU()) - innerMsm{{ toUpper $.PointName }}(&result1, int(c), samplePoints[:], scalars, false) - innerMsm{{ toUpper $.PointName }}(&result2, int(c), samplePoints[:], scalars, false) - if !result1.Equal(&result2) { - return false - } - } - return true - }, - genScalar, - )) - - // note : this test is here as we expect to have a different multiExp than the above bucket method // for small number of points properties.Property("[{{ toUpper $.PointName }}] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll( From 227a8f27c68f5ebadcf99aa359b7fc761d4830e8 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Fri, 11 Nov 2022 15:57:17 +0000 Subject: [PATCH 14/43] fix: restore previous way to generate scalars in benches --- ecc/bls12-377/multiexp_test.go | 63 +++++++++---------- ecc/bls12-378/multiexp_test.go | 63 +++++++++---------- ecc/bls12-381/multiexp_test.go | 63 +++++++++---------- ecc/bls24-315/multiexp_test.go | 63 +++++++++---------- ecc/bls24-317/multiexp_test.go | 63 +++++++++---------- ecc/bn254/multiexp_test.go | 63 +++++++++---------- ecc/bw6-633/multiexp_test.go | 63 +++++++++---------- ecc/bw6-756/multiexp_test.go | 63 +++++++++---------- ecc/bw6-761/multiexp_test.go | 63 +++++++++---------- .../ecc/template/tests/multiexp.go.tmpl | 41 ++++++------ 10 files changed, 300 insertions(+), 308 deletions(-) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 3da962b555..4b4406e922 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -17,9 +17,7 @@ package bls12377 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
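The rewritten helpers below replace crypto/rand plus parallel.Execute with a fixed-seed walk, so benchmark inputs are reproducible across runs while staying pairwise distinct (neighbouring points differ in both coordinates, which keeps bucket accumulation on the addition path rather than the doubling path, as the comment above explains). A sketch of how a benchmark can consume them; note that the new fillBenchBasesG1 scales samplePoints[0] in place, so seeding it is the caller's job here, and the sizes and names are illustrative:

func BenchmarkMsmSketch(b *testing.B) {
	const nbSamples = 1 << 16
	var samplePoints [nbSamples]G1Affine
	var sampleScalars [nbSamples]fr.Element

	samplePoints[0] = g1GenAff // seed for the deterministic walk (assumption: any nonzero point works)
	fillBenchBasesG1(samplePoints[:])
	fillBenchScalars(sampleScalars[:])

	var p G1Jac
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		p.MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{})
	}
}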
func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 1cf20b793f..b710acf39b 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -17,9 +17,7 @@ package bls12378 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. 
Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index b44ec363a2..1f8539c0bf 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -17,9 +17,7 @@ package bls12381 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
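One caveat worth noting: the (X+1, Y-1) walk below produces points that are in general not on the curve. That is harmless for timing bucket accumulation, where only coordinate arithmetic matters, but such bases must not be reused in correctness tests; the existing affine IsOnCurve method is the guard to reach for if these helpers ever leak outside the benchmarks.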
func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 8da7433fa9..9307ba079d 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -17,9 +17,7 @@ package bls24315 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. 
Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 3d99991dc1..5945e42e8a 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -17,9 +17,7 @@ package bls24317 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index e360f2f20f..23dc3b5897 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -17,9 +17,7 @@ package bn254 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. 
Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index d4e07c80ce..4c40debed6 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -17,9 +17,7 @@ package bw6633 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
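fillBenchScalars follows the same philosophy. Instead of drawing from crypto/rand on every run, sample i-1 is set to i times a fixed full-width mixer, so all limbs of every scalar are (with overwhelming probability) non-zero and successive benchmark runs see identical inputs; the trailing FromMont() converts the product out of Montgomery form. A self-contained restatement, again using the bn254 fr package as a stand-in for the per-curve fr:

import "github.com/consensys/gnark-crypto/ecc/bn254/fr"

// fillScalars mirrors the generated fillBenchScalars: sample i-1 is
// i * mixer. The mixer is a fixed, full-width field element, so no limb of
// any sample is left at zero, and the sequence is reproducible across runs
// (the crypto/rand based filler it replaces was not).
func fillScalars(sampleScalars []fr.Element) {
	var mixer fr.Element
	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
	for i := 1; i <= len(sampleScalars); i++ {
		sampleScalars[i-1].SetUint64(uint64(i)).
			Mul(&sampleScalars[i-1], &mixer).
			FromMont()
	}
}

Calling fillScalars twice yields identical slices, which keeps reference numbers comparable across commits; the nbSamples reduction from 1<<23 to 1<<20 in the same hunks trims the setup cost of the reference benchmarks.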
func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 64f9ca10ec..d79044f69c 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -17,9 +17,7 @@ package bw6756 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. 
Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). + FromMont() + } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 613a4bac53..2dcc22a913 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -17,9 +17,7 @@ package bw6761 import ( - rrand "crypto/rand" "fmt" - "math" "math/big" "math/bits" "math/rand" @@ -30,7 +28,6 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/leanovate/gopter" "github.com/leanovate/gopter/prop" ) @@ -247,7 +244,7 @@ func BenchmarkMultiExpG1(b *testing.B) { } func BenchmarkMultiExpG1Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G1Affine @@ -304,17 +301,17 @@ func BenchmarkManyMultiExpG1Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. 
func fillBenchBasesG1(samplePoints []G1Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g1GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func TestMultiExpG2(t *testing.T) { @@ -527,7 +524,7 @@ func BenchmarkMultiExpG2(b *testing.B) { } func BenchmarkMultiExpG2Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]G2Affine @@ -584,24 +581,26 @@ func BenchmarkManyMultiExpG2Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBasesG2(samplePoints []G2Affine) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&g2GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start + 1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index e6cd2b0338..070481bf7b 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -12,14 +12,11 @@ import ( "time" "runtime" "math/rand" - rrand "crypto/rand" "math/big" "testing" "math/bits" "sync" - "math" - "github.com/consensys/gnark-crypto/internal/parallel" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fr" "github.com/leanovate/gopter" @@ -262,7 +259,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { func BenchmarkMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) { - const nbSamples = 1 << 23 + const nbSamples = 1 << 20 var ( samplePoints [nbSamples]{{ $.TAffine }} @@ -321,26 +318,30 @@ func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) { // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { - max := new(big.Int).SetInt64(math.MaxInt64) - parallel.Execute(len(samplePoints), func(start, end int) { - r, _ := rrand.Int(rrand.Reader, max) - samplePoints[start].ScalarMultiplication(&{{$.PointName}}GenAff, r) - rr := samplePoints[start].X - rr.SetOne() - for i := start+1; i < end; i++ { - samplePoints[i].X.Add(&samplePoints[i-1].X, &rr) - samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &rr) - } - }) + var r big.Int + r.SetString("340444420969191673093399857471996460938405", 10) + samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) + + one := samplePoints[0].X + one.SetOne() + + for i := 1; i < len(samplePoints); i++ { + samplePoints[i].X.Add(&samplePoints[i-1].X, &one) + samplePoints[i].Y.Sub(&samplePoints[i-1].Y, &one) + } } + {{end }} + func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - parallel.Execute(len(sampleScalars), func(start, end int) { - for i := start; i < end; i++ { - sampleScalars[i].SetRandom() - } - }) + var mixer fr.Element + mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") + for i := 1; i <= len(sampleScalars); i++ { + sampleScalars[i-1].SetUint64(uint64(i)). + Mul(&sampleScalars[i-1], &mixer). 
+ FromMont() + } } From f973cf4bb1fcf6abc7a4ea3db4b12dc584f79c80 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Fri, 11 Nov 2022 16:04:19 +0000 Subject: [PATCH 15/43] fix: fix splitting logic in msm --- ecc/bls12-377/multiexp.go | 12 ++++++------ ecc/bls12-378/multiexp.go | 12 ++++++------ ecc/bls12-381/multiexp.go | 12 ++++++------ ecc/bls24-315/multiexp.go | 12 ++++++------ ecc/bls24-317/multiexp.go | 12 ++++++------ ecc/bn254/multiexp.go | 12 ++++++------ ecc/bw6-633/multiexp.go | 12 ++++++------ ecc/bw6-756/multiexp.go | 12 ++++++------ ecc/bw6-761/multiexp.go | 12 ++++++------ internal/generator/ecc/template/multiexp.go.tmpl | 6 +++--- 10 files changed, 57 insertions(+), 57 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 3c7e9fa3e4..53a1823e0d 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 2371d32a5c..73e162f80d 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
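The one-line change repeated in every hunk of this patch is easiest to read as a predicate. When an MSM is split in two halves, each half runs with config.NbTasks/2, so the post-split task count has to be weighed against half the budget; comparing it against the full budget, as the old code did, made the split look cheaper than it is. A condensed restatement with ad hoc names, extracted from the hunks above:

// shouldSplit restates the corrected condition.
// nbChunks: chunks at the current window size, for the whole MSM.
// nbChunksPostSplit: chunks each half would need at the smaller window.
// nbTasks: the caller's total task budget (config.NbTasks).
func shouldSplit(nbChunks, nbChunksPostSplit, nbTasks int) bool {
	nbTasksPostSplit := nbChunksPostSplit * 2 // two halves run concurrently

	// split if the two halves still fit in half the budget each, or if the
	// task overshoot after splitting is no worse than the CPU left idle
	// without splitting.
	return nbTasksPostSplit <= nbTasks/2 ||
		(nbTasksPostSplit-nbTasks/2) <= (nbTasks-nbChunks)
}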
config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index e13ca90588..80ff8bfc30 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index d73bb2783e..f61ab96f3d 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index ceee16e7fd..8b81840e50 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 1167251103..ac979ddff7 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -364,17 +364,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 3ca13bafc4..23c35d3d90 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -327,17 +327,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 87598422a7..0124b603f4 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -328,17 +328,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index d5165db4a5..fc2c7c4908 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -116,17 +116,17 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p G1Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG1(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG1(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil @@ -328,17 +328,17 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 - if (nbTasksPostSplit <= config.NbTasks) || (nbTasksPostSplit-config.NbTasks) <= (config.NbTasks-nbChunks) { + if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. 
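The other change in this patch is that the split no longer calls innerMsm* with a precomputed cSplit window: each half goes back through MultiExp, which re-derives the best window size for the halved task budget and may split again. The shape of that recursion, sketched against the bn254 types (splitMSM is an illustrative name; error handling is elided, as in the generated code):

import (
	"github.com/consensys/gnark-crypto/ecc"
	"github.com/consensys/gnark-crypto/ecc/bn254"
	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
)

// splitMSM shows the split-and-merge pattern: halve the task budget, run
// the first half in a goroutine and the second half on the current
// goroutine, then fold the two partial results together.
func splitMSM(p *bn254.G1Jac, points []bn254.G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *bn254.G1Jac {
	n := len(points)
	config.NbTasks /= 2 // each recursive call sees half the budget

	var q bn254.G1Jac
	done := make(chan struct{}, 1)
	go func() {
		q.MultiExp(points[:n/2], scalars[:n/2], config)
		close(done)
	}()
	p.MultiExp(points[n/2:], scalars[n/2:], config)
	<-done
	return p.AddAssign(&q)
}

config is passed by value, so halving NbTasks inside the call does not leak into the caller.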
config.NbTasks /= 2 var _p G2Jac chDone := make(chan struct{}, 1) go func() { - innerMsmG2(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsmG2(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 67ed6b9027..5940c0e8c5 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -385,17 +385,17 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit*2 - if (nbTasksPostSplit <= config.NbTasks) || ( nbTasksPostSplit - config.NbTasks ) <= ( config.NbTasks - nbChunks) { + if (nbTasksPostSplit <= config.NbTasks /2 ) || ( nbTasksPostSplit - config.NbTasks/2 ) <= ( config.NbTasks - nbChunks) { // if postSplit we still have less tasks than available CPU // or if we have more tasks BUT the difference of CPU usage is in our favor, we split. config.NbTasks /= 2 var _p {{ $.TJacobian }} chDone := make(chan struct{}, 1) go func() { - innerMsm{{ $.UPointName }}(&_p, int(cSplit), points[:nbPoints/2], scalars[:nbPoints/2], config) + _p.MultiExp(points[:nbPoints/2], scalars[:nbPoints/2], config) close(chDone) }() - innerMsm{{ $.UPointName }}(p, int(cSplit), points[nbPoints/2:], scalars[nbPoints/2:], config) + p.MultiExp(points[nbPoints/2:], scalars[nbPoints/2:], config) <-chDone p.AddAssign(&_p) return p, nil From 85e6ea0cbb23c22ee1b6499b2ce37444c9f50005 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 11:52:42 -0600 Subject: [PATCH 16/43] feat: store neg(P) and P in opposite sides of batch add input slice --- ecc/bls12-377/g1.go | 65 +++++++---- ecc/bls12-377/g1_test.go | 5 +- ecc/bls12-377/g2.go | 65 +++++++---- ecc/bls12-377/g2_test.go | 5 +- ecc/bls12-377/multiexp_affine.go | 106 +++++++++++------- ecc/bls12-378/g1.go | 65 +++++++---- ecc/bls12-378/g1_test.go | 5 +- ecc/bls12-378/g2.go | 65 +++++++---- ecc/bls12-378/g2_test.go | 5 +- ecc/bls12-378/multiexp_affine.go | 106 +++++++++++------- ecc/bls12-381/g1.go | 65 +++++++---- ecc/bls12-381/g1_test.go | 5 +- ecc/bls12-381/g2.go | 65 +++++++---- ecc/bls12-381/g2_test.go | 5 +- ecc/bls12-381/multiexp_affine.go | 106 +++++++++++------- ecc/bls24-315/g1.go | 65 +++++++---- ecc/bls24-315/g1_test.go | 5 +- ecc/bls24-315/g2.go | 65 +++++++---- ecc/bls24-315/g2_test.go | 5 +- ecc/bls24-315/multiexp_affine.go | 106 +++++++++++------- ecc/bls24-317/g1.go | 65 +++++++---- ecc/bls24-317/g1_test.go | 5 +- ecc/bls24-317/g2.go | 65 +++++++---- ecc/bls24-317/g2_test.go | 5 +- ecc/bls24-317/multiexp_affine.go | 106 +++++++++++------- ecc/bn254/g1.go | 65 +++++++---- ecc/bn254/g1_test.go | 5 +- ecc/bn254/g2.go | 65 +++++++---- ecc/bn254/g2_test.go | 5 +- ecc/bn254/multiexp_affine.go | 106 +++++++++++------- ecc/bw6-633/g1.go | 65 +++++++---- ecc/bw6-633/g1_test.go | 5 +- ecc/bw6-633/g2.go | 65 +++++++---- ecc/bw6-633/g2_test.go | 5 +- ecc/bw6-633/multiexp_affine.go | 106 +++++++++++------- ecc/bw6-756/g1.go | 65 +++++++---- ecc/bw6-756/g1_test.go | 5 +- ecc/bw6-756/g2.go | 65 +++++++---- ecc/bw6-756/g2_test.go | 5 +- ecc/bw6-756/multiexp_affine.go | 106 +++++++++++------- ecc/bw6-761/g1.go | 65 +++++++---- ecc/bw6-761/g1_test.go | 5 +- ecc/bw6-761/g2.go | 65 
+++++++---- ecc/bw6-761/g2_test.go | 5 +- ecc/bw6-761/multiexp_affine.go | 106 +++++++++++------- .../ecc/template/multiexp_affine.go.tmpl | 54 +++++---- internal/generator/ecc/template/point.go.tmpl | 65 +++++++---- .../ecc/template/tests/point.go.tmpl | 5 +- 48 files changed, 1502 insertions(+), 836 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 962a527a2e..3602b14992 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -980,25 +980,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contain references to the points to ADD +// R[N-cptSub:], P[N-cptSub:] contain references to the points to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1007,17 +1011,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // compute lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1028,6 +1026,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // compute lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-377/g1_test.go b/ecc/bls12-377/g1_test.go index afb23458b1..105a2d0a9a 100644 --- a/ecc/bls12-377/g1_test.go +++ b/ecc/bls12-377/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git
a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index dd09808a13..04c0f5fac5 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -976,25 +976,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contain references to the points to ADD +// R[N-cptSub:], P[N-cptSub:] contain references to the points to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1003,17 +1007,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E2 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // compute lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1024,6 +1022,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // compute lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-377/g2_test.go b/ecc/bls12-377/g2_test.go index 52e3ff41c1..9048e32439 100644 --- a/ecc/bls12-377/g2_test.go +++ b/ecc/bls12-377/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 33cbf3844b..c8b2686337 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence
of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
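Two things are going on in these hunks. First, any bucket/point pair with equal X coordinates is filtered out before it reaches the batch: the batched formula computes lambda = (yP - yR)/(xP - xR), then x3 = lambda^2 - xR - xP and y3 = lambda*(xR - x3) - yR, so xP == xR would feed a zero into the shared inversion; those cases resolve immediately to infinity (P plus -P) or to an explicit doubling. Second, additions and subtractions now share one slice pair from opposite ends, so the batch add can fold the negation into its lambda computation (d.Neg then d.Sub above) instead of materializing -P. A sketch of that two-ended bookkeeping, with an ad hoc type, the bn254 import from the earlier sketches, and a fixed 64 standing in for MAX_BATCH_SIZE:

import "github.com/consensys/gnark-crypto/ecc/bn254"

// twoEndedBatch mirrors the bookkeeping in the add closure: bucket
// references and point pointers for additions grow from index 0, those for
// subtractions grow from the end, and the untouched middle is simply
// skipped when the batch is flushed before it is full.
type twoEndedBatch struct {
	R, P           [64]*bn254.G1Affine
	cptAdd, cptSub int
}

func (b *twoEndedBatch) push(bucket, point *bn254.G1Affine, isNeg bool) {
	if isNeg {
		b.cptSub++
		b.R[len(b.R)-b.cptSub] = bucket
		b.P[len(b.P)-b.cptSub] = point
	} else {
		b.R[b.cptAdd] = bucket
		b.P[b.cptAdd] = point
		b.cptAdd++
	}
}

Storing pointers on both sides (the P array is now []*G1Affine) also removes the per-point copy or negation the previous version paid on every enqueue.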
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 8422e95efb..d36be41445 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -980,25 +980,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contain point references to ADD +// R[N-cptSub:], P[N-cptSub:] contain point references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1007,17 +1011,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // compute lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1028,6 +1026,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // compute lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-378/g1_test.go b/ecc/bls12-378/g1_test.go index 3859bb2695..a3603c49cb 100644 --- a/ecc/bls12-378/g1_test.go +++ b/ecc/bls12-378/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP
[MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 9cca73e6b3..9803f61512 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -976,25 +976,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1003,17 +1007,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E2 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1024,6 +1022,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-378/g2_test.go b/ecc/bls12-378/g2_test.go index f81d14069b..ffe94dbc2e 100644 --- a/ecc/bls12-378/g2_test.go +++ b/ecc/bls12-378/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, 
MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index b631d13a72..b30717ffea 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index bb37dacb65..b05d04acc8 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -980,25 +980,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1007,17 +1011,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1028,6 +1026,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-381/g1_test.go b/ecc/bls12-381/g1_test.go index ee4ce9fb21..68a84cf073 100644 --- a/ecc/bls12-381/g1_test.go +++ b/ecc/bls12-381/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], 
MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 86ce9db5b6..c69c7f0444 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -977,25 +977,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1004,17 +1008,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E2 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1025,6 +1023,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls12-381/g2_test.go b/ecc/bls12-381/g2_test.go index a243b65b01..129a541689 100644 --- a/ecc/bls12-381/g2_test.go +++ b/ecc/bls12-381/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index bf65dc9aa1..a2a7eb8ffb 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 
1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index e55d4ad4cb..86e394a710 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -982,25 +982,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1009,17 +1013,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1030,6 +1028,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls24-315/g1_test.go b/ecc/bls24-315/g1_test.go index d1061a803e..d3840c2537 100644 --- a/ecc/bls24-315/g1_test.go +++ b/ecc/bls24-315/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP 
[MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index f5dffd0752..6170c188b6 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -992,25 +992,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1019,17 +1023,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E4 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1040,6 +1038,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls24-315/g2_test.go b/ecc/bls24-315/g2_test.go index ccdac4012c..2e97c208c0 100644 --- a/ecc/bls24-315/g2_test.go +++ b/ecc/bls24-315/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, 
MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 50a7c5613a..5db551830d 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 58bee14819..1d4e27c062 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -982,25 +982,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1009,17 +1013,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1030,6 +1028,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls24-317/g1_test.go b/ecc/bls24-317/g1_test.go index 3673290566..59fd0c425f 100644 --- a/ecc/bls24-317/g1_test.go +++ b/ecc/bls24-317/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], 
MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index f5fb993fb4..bbfcfd12b2 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -992,25 +992,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1019,17 +1023,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E4 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1040,6 +1038,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bls24-317/g2_test.go b/ecc/bls24-317/g2_test.go index 74c8576f89..f02c85f79d 100644 --- a/ecc/bls24-317/g2_test.go +++ b/ecc/bls24-317/g2_test.go @@ -508,7 +508,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -523,11 +523,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 82ec92b92f..b517f1ef32 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 
1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 80cec53604..6f9a4d4e59 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -952,25 +952,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -979,17 +983,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1000,6 +998,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bn254/g1_test.go b/ecc/bn254/g1_test.go index c87502be96..ffc6160b01 100644 --- a/ecc/bn254/g1_test.go +++ b/ecc/bn254/g1_test.go @@ -463,7 +463,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx 
[MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -478,11 +478,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 79215583d5..762a6f944b 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -981,25 +981,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1008,17 +1012,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fptower.E2 var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1029,6 +1027,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bn254/g2_test.go b/ecc/bn254/g2_test.go index 83d34fee91..9d0b38a11a 100644 --- a/ecc/bn254/g2_test.go +++ b/ecc/bn254/g2_test.go @@ -507,7 +507,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -522,11 +522,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bn254/multiexp_affine.go 
b/ecc/bn254/multiexp_affine.go index b750572b22..48b5e6e242 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -268,9 +283,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -278,17 +294,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -308,28 +326,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index 41a18cf2af..9e61e67732 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1084,25 +1084,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1111,17 +1115,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1132,6 +1130,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-633/g1_test.go b/ecc/bw6-633/g1_test.go index 827cee65dd..91e28e75e0 100644 --- a/ecc/bw6-633/g1_test.go +++ b/ecc/bw6-633/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, 
MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index de70170a12..6f021168b5 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -947,25 +947,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -974,17 +978,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fp.Element var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -995,6 +993,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-633/g2_test.go b/ecc/bw6-633/g2_test.go index 82ddc5385b..a51ae94c50 100644 --- a/ecc/bw6-633/g2_test.go +++ b/ecc/bw6-633/g2_test.go @@ -489,7 +489,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -504,11 +504,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 12762830f9..74c9b3d4dc 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify 
presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -250,9 +265,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -260,17 +276,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -290,28 +308,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
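[editor's note] The enqueue logic this closure ends with (see the lines that follow) packs additions at the front of R/P and subtractions at the back, so one fixed-size batch and a single batch inversion serve both kinds of ops. A condensed sketch of just that two-ended packing, with an illustrative op type:

type batchedOp struct {
	bucketID uint32
	neg      bool
}

// pack places o into batch: adds fill [0, cptAdd) from the left, subs fill
// [len(batch)-cptSub, len(batch)) from the right. The batch is full once
// cptAdd + cptSub == len(batch).
func pack(batch []batchedOp, cptAdd, cptSub *int, o batchedOp) {
	if o.neg {
		*cptSub++
		batch[len(batch)-*cptSub] = o
	} else {
		batch[*cptAdd] = o
		*cptAdd++
	}
}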
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index e1c7e9056a..fee3c6884b 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1084,25 +1084,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1111,17 +1115,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1132,6 +1130,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-756/g1_test.go b/ecc/bw6-756/g1_test.go index fc64f7646c..cfc93383c9 100644 --- a/ecc/bw6-756/g1_test.go +++ b/ecc/bw6-756/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var 
ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index 5302819c4b..195322273e 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -941,25 +941,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -968,17 +972,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fp.Element var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -989,6 +987,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-756/g2_test.go b/ecc/bw6-756/g2_test.go index 065dc4432e..699df087ed 100644 --- a/ecc/bw6-756/g2_test.go +++ b/ecc/bw6-756/g2_test.go @@ -489,7 +489,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -504,11 +504,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git 
a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index e4748e2c8b..02bc11523c 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -250,9 +265,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -260,17 +276,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -290,28 +308,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
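[editor's note] In all of these chunk processors the doubling case falls back to BK.Add(BK, BK) outside the batch. The reason is mechanical: the batched chord slope is lambda = (yP - yR)/(xP - xR), and all denominators are inverted together with a single field inversion, so a zero denominator (xP == xR) would zero the running product and corrupt every inverse in the batch. A sketch of the guard, over math/big purely for illustration:

import "math/big"

// canBatch reports whether R + P may go through the batched chord formula:
// the denominator xP - xR must be invertible, i.e. nonzero.
func canBatch(xR, xP *big.Int) bool {
	return xR.Cmp(xP) != 0
}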
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 86b99ebd1a..3537495b46 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1095,25 +1095,29 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1122,17 +1126,11 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { var d fp.Element var rr G1Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1143,6 +1141,27 @@ func BatchAddG1Affine(R []*G1Affine, P []G1Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-761/g1_test.go b/ecc/bw6-761/g1_test.go index 3be460742f..5b1b389102 100644 --- a/ecc/bw6-761/g1_test.go +++ b/ecc/bw6-761/g1_test.go @@ -502,7 +502,7 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG1Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G1Affine - var RR [MAX_BATCH_SIZE]*G1Affine + var RR, PP [MAX_BATCH_SIZE]*G1Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG1(P[:]) @@ -517,11 +517,12 @@ func BenchmarkBatchAddG1Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG1Affine(RR[:], P[:]) + batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, 
MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 77c4e1d375..41cfea623f 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -955,25 +955,29 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i := len(R) - cptSub; i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -982,17 +986,11 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { var d fp.Element var rr G2Affine - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1003,6 +1001,27 @@ func BatchAddG2Affine(R []*G2Affine, P []G2Affine) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j + offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } // batch inversion diff --git a/ecc/bw6-761/g2_test.go b/ecc/bw6-761/g2_test.go index 0268875661..76d8b7f7de 100644 --- a/ecc/bw6-761/g2_test.go +++ b/ecc/bw6-761/g2_test.go @@ -489,7 +489,7 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { func BenchmarkBatchAddG2Affine(b *testing.B) { var P, R [MAX_BATCH_SIZE]G2Affine - var RR [MAX_BATCH_SIZE]*G2Affine + var RR, PP [MAX_BATCH_SIZE]*G2Affine var ridx [MAX_BATCH_SIZE]int fillBenchBasesG2(P[:]) @@ -504,11 +504,12 @@ func BenchmarkBatchAddG2Affine(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAddG2Affine(RR[:], P[:]) + batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 167bdf2902..4c9c97691d 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -55,9 +55,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify 
presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -65,17 +66,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG1Affine(R[:cptP], P[:cptP]) + batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -95,28 +98,40 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp @@ -250,9 +265,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -260,17 +276,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return cptP == batchSize + return (cptAdd + cptSub) == batchSize } executeAndReset := func() { - if cptP == 0 { + if (cptAdd + cptSub) == 0 { return } - BatchAddG2Affine(R[:cptP], P[:cptP]) + batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -290,28 +308,40 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
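[editor's note] Every batchAdd body in this patch evaluates the same affine chord formulas. For R = (x1, y1) and P = (x2, y2) with x1 != x2:

    lambda = (y2 - y1) / (x2 - x1)
    x3     = lambda^2 - x1 - x2
    y3     = lambda*(x1 - x3) - y1

Subtraction needs no separate point: -P = (x2, -y2) shares P's X coordinate, so the sub loops reuse the already-inverted denominator and only negate the numerator, lambda = (-y2 - y1)/(x2 - x1), which is exactly the d.Neg(&P[idx].Y); d.Sub(&d, &R[idx].Y) sequence above. That shared denominator is what lets both halves ride on one batch inversion.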
+ // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . + BK.Add(BK, BK) return } + BK.setInfinity() + return } - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize-cptSub] = BK + P[batchSize-cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } var queue [MAX_BATCH_SIZE]batchOp diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 16071451e5..8902aeb919 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -56,9 +56,10 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c batchSize = 1 } var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptP := 0 // count the number of point added to current batch + cptAdd := 0 // count the number of bucket + point added to current batch + cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]{{ $.TAffine }} // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]*{{ $.TAffine }} // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // bucket references canAdd := func(bID uint32) bool { @@ -66,17 +67,19 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c } isFull := func() bool { - return cptP == batchSize + return (cptAdd+cptSub) == batchSize } executeAndReset := func () { - if cptP == 0 { + if (cptAdd+cptSub) == 0 { return } - BatchAdd{{ $.TAffine }}(R[:cptP], P[:cptP]) + batchAdd{{ $.TAffine }}(R[:batchSize], P[:batchSize], cptAdd, cptSub) + var tmp BS bucketIds = tmp - cptP = 0 + cptAdd = 0 + cptSub = 0 } add := func(op batchOp) { @@ -96,28 +99,41 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c } return } - if op.isNeg() { - // if bucket == P --> -P == 0 - if BK.Equal(PP) { - BK.setInfinity() + if BK.X.Equal(&PP.X) { + if BK.Y.Equal(&PP.Y) { + if op.isNeg() { + // P + -P + BK.setInfinity() + return + } + // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) return } - } else { - // if bucket == -P, B == 0 - if BK.X.Equal(&PP.X) && !BK.Y.Equal(&PP.Y) { - BK.setInfinity() + // b.Y == -p.Y + if op.isNeg() { + // doubling . 
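[editor's note] The generated processChunk*BatchAffine functions and this template all share one scheduling pattern: a bucket may appear at most once per batch (the bucketIds bitset), conflicting ops wait in a queue, and the batch is flushed when full. A simplified, self-contained model of that loop; queue replay after a flush is elided here, the real code drains it opportunistically:

type chunkOp struct{ bucketID uint32 }

func schedule(ops []chunkOp, batchSize int, execute func([]chunkOp)) {
	inBatch := make(map[uint32]bool) // stand-in for the BS bitset
	batch := make([]chunkOp, 0, batchSize)
	var queue []chunkOp

	flush := func() {
		if len(batch) == 0 {
			return
		}
		execute(batch)
		batch = batch[:0]
		inBatch = make(map[uint32]bool)
	}

	for _, o := range ops {
		if inBatch[o.bucketID] {
			queue = append(queue, o) // conflict: bucket already in this batch
			continue
		}
		inBatch[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchSize {
			flush()
		}
	}
	// queued ops would be replayed here over further batches
	flush()
}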
+ BK.Add(BK, BK) return } + BK.setInfinity() + return } + - bucketIds[op.bucketID] = true //struct{}{} - R[cptP] = BK + bucketIds[op.bucketID] = true if op.isNeg() { - P[cptP].Neg(PP) + cptSub++ + R[batchSize - cptSub] = BK + P[batchSize - cptSub] = PP } else { - P[cptP].Set(PP) + R[cptAdd] = BK + P[cptAdd] = PP + cptAdd++ } - cptP++ + } diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index c88ca29b88..d1107bb1c2 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1571,26 +1571,30 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca -// batch add/dbl in affine coordinates +// batch add/sub in affine coordinates // using batch inversion // cost add: 5*batchSize M + 1I, dbl: +1M -func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}) { - batchSize := len(R) +// len(R) == len(P) == N +// R[:cptAdd], P[:cptAdd] contains points references to ADD +// R[N-cptSub:], P[N-cptSub] contains points references to SUB +// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +func batchAdd{{ $TAffine }}(R,P []*{{ $TAffine }}, cptAdd, cptSub int) { + batchSize := cptAdd + cptSub if batchSize == 0 { return } - var isDbl [MAX_BATCH_SIZE]bool var lambda, lambdain [MAX_BATCH_SIZE]{{.CoordType}} - for j := 0; j < batchSize; j++ { - // detect dbl vs add & compute denominator - if P[j].Equal(R[j]) { - isDbl[j] = true - lambdain[j].Double(&P[j].Y) - } else { - lambdain[j].Sub(&P[j].X, &R[j].X) - } + j := 0 + // add part + for j = 0; j < cptAdd; j++ { + lambdain[j].Sub(&P[j].X, &R[j].X) + } + // sub part + for i:=len(R) - cptSub ;i < len(R); i++ { + lambdain[j].Sub(&P[i].X, &R[i].X) + j++ } // invert denominator @@ -1599,17 +1603,11 @@ func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}) { var d {{.CoordType}} var rr {{ $TAffine }} - for j := 0; j < batchSize; j++ { - // computa lambda, distinguishing dbl / add - if isDbl[j] { - d.Square(&P[j].X) - lambda[j].Mul(&lambda[j], &d) - d.Double(&lambda[j]) - lambda[j].Add(&lambda[j], &d) - } else { - d.Sub(&P[j].Y, &R[j].Y) - lambda[j].Mul(&lambda[j], &d) - } + // add part + for j := 0; j < cptAdd; j++ { + // computa lambda + d.Sub(&P[j].Y, &R[j].Y) + lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) @@ -1620,6 +1618,27 @@ func BatchAdd{{ $TAffine }}(R []*{{ $TAffine }}, P []{{ $TAffine }}) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } + + // middle of the input may be ignored if cptAdd + cptSub != len(R) + offset := len(R) - batchSize + + // sub part + for j := cptAdd; j < batchSize; j++ { + // computa lambda + idx := j+offset + d.Neg(&P[idx].Y) + d.Sub(&d, &R[idx].Y) + lambda[j].Mul(&lambda[j], &d) + + // compute X, Y + rr.X.Square(&lambda[j]) + rr.X.Sub(&rr.X, &R[idx].X) + rr.X.Sub(&rr.X, &P[idx].X) + d.Sub(&R[idx].X, &rr.X) + rr.Y.Mul(&lambda[j], &d) + rr.Y.Sub(&rr.Y, &R[idx].Y) + R[idx].Set(&rr) + } } diff --git a/internal/generator/ecc/template/tests/point.go.tmpl b/internal/generator/ecc/template/tests/point.go.tmpl index e033f96dd7..ee54b2dd1b 100644 --- a/internal/generator/ecc/template/tests/point.go.tmpl +++ b/internal/generator/ecc/template/tests/point.go.tmpl @@ -562,7 +562,7 @@ func Benchmark{{ $TJacobian }}IsInSubGroup(b *testing.B) { func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { var P, R [MAX_BATCH_SIZE]{{ $TAffine }} - var RR [MAX_BATCH_SIZE]*{{ $TAffine }} + var RR, PP [MAX_BATCH_SIZE]*{{ $TAffine }} var ridx [MAX_BATCH_SIZE]int 
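[editor's note] The benchmark template that begins just above (and continues below) permutes ridx before wiring up the RR/PP pointer arrays. The shuffle is presumably deliberate: it makes each batched add chase pointers in random order, modeling the scattered bucket access of a real MSM rather than a cache-friendly linear sweep. The idiom in isolation:

import "math/rand"

func shuffledIndices(n int) []int {
	ridx := make([]int, n)
	for i := range ridx {
		ridx[i] = i
	}
	rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] })
	return ridx
}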
fillBenchBases{{ toUpper $.PointName }}(P[:]) @@ -577,11 +577,12 @@ func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { for i, ri := range ridx { RR[i] = &R[ri] + PP[i] = &P[ri] } b.ResetTimer() for i := 0; i < b.N; i++ { - BatchAdd{{ $TAffine }}(RR[:], P[:]) + batchAdd{{ $TAffine }}(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) } } From decae893557b67a921146a1fe4f1154d90412f8a Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 12:03:27 -0600 Subject: [PATCH 17/43] feat: revert part of previous commit --- ecc/bls12-377/g1.go | 35 ++-------------- ecc/bls12-377/g1_test.go | 41 +++++++++--------- ecc/bls12-377/g2.go | 35 ++-------------- ecc/bls12-377/g2_test.go | 41 +++++++++--------- ecc/bls12-377/multiexp_affine.go | 42 +++++++------------ ecc/bls12-378/g1.go | 35 ++-------------- ecc/bls12-378/g1_test.go | 41 +++++++++--------- ecc/bls12-378/g2.go | 35 ++-------------- ecc/bls12-378/g2_test.go | 41 +++++++++--------- ecc/bls12-378/multiexp_affine.go | 42 +++++++------------ ecc/bls12-381/g1.go | 35 ++-------------- ecc/bls12-381/g1_test.go | 41 +++++++++--------- ecc/bls12-381/g2.go | 35 ++-------------- ecc/bls12-381/g2_test.go | 41 +++++++++--------- ecc/bls12-381/multiexp_affine.go | 42 +++++++------------ ecc/bls24-315/g1.go | 35 ++-------------- ecc/bls24-315/g1_test.go | 41 +++++++++--------- ecc/bls24-315/g2.go | 35 ++-------------- ecc/bls24-315/g2_test.go | 41 +++++++++--------- ecc/bls24-315/multiexp_affine.go | 42 +++++++------------ ecc/bls24-317/g1.go | 35 ++-------------- ecc/bls24-317/g1_test.go | 41 +++++++++--------- ecc/bls24-317/g2.go | 35 ++-------------- ecc/bls24-317/g2_test.go | 41 +++++++++--------- ecc/bls24-317/multiexp_affine.go | 42 +++++++------------ ecc/bn254/g1.go | 35 ++-------------- ecc/bn254/g1_test.go | 41 +++++++++--------- ecc/bn254/g2.go | 35 ++-------------- ecc/bn254/g2_test.go | 41 +++++++++--------- ecc/bn254/multiexp_affine.go | 42 +++++++------------ ecc/bw6-633/g1.go | 35 ++-------------- ecc/bw6-633/g1_test.go | 41 +++++++++--------- ecc/bw6-633/g2.go | 35 ++-------------- ecc/bw6-633/g2_test.go | 41 +++++++++--------- ecc/bw6-633/multiexp_affine.go | 42 +++++++------------ ecc/bw6-756/g1.go | 35 ++-------------- ecc/bw6-756/g1_test.go | 41 +++++++++--------- ecc/bw6-756/g2.go | 35 ++-------------- ecc/bw6-756/g2_test.go | 41 +++++++++--------- ecc/bw6-756/multiexp_affine.go | 42 +++++++------------ ecc/bw6-761/g1.go | 35 ++-------------- ecc/bw6-761/g1_test.go | 41 +++++++++--------- ecc/bw6-761/g2.go | 35 ++-------------- ecc/bw6-761/g2_test.go | 41 +++++++++--------- ecc/bw6-761/multiexp_affine.go | 42 +++++++------------ .../ecc/template/multiexp_affine.go.tmpl | 21 ++++------ internal/generator/ecc/template/point.go.tmpl | 35 ++-------------- .../ecc/template/tests/point.go.tmpl | 41 +++++++++--------- 48 files changed, 608 insertions(+), 1235 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 3602b14992..3be98b91a4 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -987,23 +987,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain 
[MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1012,7 +1006,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1026,27 +1020,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-377/g1_test.go b/ecc/bls12-377/g1_test.go index 105a2d0a9a..eb09d3cca4 100644 --- a/ecc/bls12-377/g1_test.go +++ b/ecc/bls12-377/g1_test.go @@ -19,7 +19,6 @@ package bls12377 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 04c0f5fac5..4b6f3de628 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -983,23 +983,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - j := 0 // add part - for j 
= 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1008,7 +1002,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1022,27 +1016,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-377/g2_test.go b/ecc/bls12-377/g2_test.go index 9048e32439..c0653c32af 100644 --- a/ecc/bls12-377/g2_test.go +++ b/ecc/bls12-377/g2_test.go @@ -19,7 +19,6 @@ package bls12377 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/internal/fptower" @@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c8b2686337..41c16a3afe 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ 
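[editor's note] The batchInvertG1Affine / batchInvertG2Affine helpers called throughout these diffs implement the standard Montgomery batch-inversion trick: one field inversion plus roughly three multiplications per element. A self-contained sketch over math/big (illustrative only; the library versions operate on fp.Element and fptower.E2):

import "math/big"

// batchInvert sets res[i] = a[i]^-1 mod p using a single modular inversion.
// Zero inputs are not handled; as in the code above, callers must filter
// zero denominators before batching.
func batchInvert(res, a []*big.Int, p *big.Int) {
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // prefix product a[0]*...*a[i-1]
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // (a[0]*...*a[n-1])^-1
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // now a[i]^-1
		acc.Mul(acc, a[i]).Mod(acc, p)         // strip a[i] from the inverse
	}
}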
func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index d36be41445..1545108a66 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -987,23 +987,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1012,7 +1006,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1026,27 +1020,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) 
R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-378/g1_test.go b/ecc/bls12-378/g1_test.go index a3603c49cb..6752818d29 100644 --- a/ecc/bls12-378/g1_test.go +++ b/ecc/bls12-378/g1_test.go @@ -19,7 +19,6 @@ package bls12378 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 9803f61512..26aaa42624 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -983,23 +983,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1008,7 +1002,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1022,27 +1016,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be 
ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-378/g2_test.go b/ecc/bls12-378/g2_test.go index ffe94dbc2e..a9632dc413 100644 --- a/ecc/bls12-378/g2_test.go +++ b/ecc/bls12-378/g2_test.go @@ -19,7 +19,6 @@ package bls12378 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/internal/fptower" @@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index b30717ffea..95eb76b3ac 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { 
- R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index b05d04acc8..5a59011791 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -987,23 +987,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1012,7 +1006,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1026,27 +1020,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-381/g1_test.go b/ecc/bls12-381/g1_test.go index 68a84cf073..223c3763c0 100644 --- a/ecc/bls12-381/g1_test.go +++ b/ecc/bls12-381/g1_test.go @@ -19,7 +19,6 @@ 
package bls12381 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index c69c7f0444..6b7dfa5639 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -984,23 +984,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1009,7 +1003,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1023,27 +1017,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls12-381/g2_test.go b/ecc/bls12-381/g2_test.go index 129a541689..be4957738e 100644 --- a/ecc/bls12-381/g2_test.go +++ b/ecc/bls12-381/g2_test.go @@ -19,7 +19,6 @@ package bls12381 import ( "fmt" "math/big" - "math/rand" 
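// ("math/rand" is dropped here because its only user in this file, the
// rand.Shuffle call in BenchmarkBatchAddG2Affine, is commented out below;
// Go treats an unused import as a compile error, so the import must go too.)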
"testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/internal/fptower" @@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index a2a7eb8ffb..5a51ee46b6 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func 
processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index 86e394a710..cd0d0a8a69 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -989,23 +989,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1014,7 +1008,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1028,27 +1022,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls24-315/g1_test.go b/ecc/bls24-315/g1_test.go index d3840c2537..4ffe3679c7 100644 --- a/ecc/bls24-315/g1_test.go +++ b/ecc/bls24-315/g1_test.go @@ -19,7 +19,6 @@ package bls24315 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < 
len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 6170c188b6..7fa2e026c3 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -999,23 +999,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1024,7 +1018,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1038,27 +1032,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls24-315/g2_test.go b/ecc/bls24-315/g2_test.go index 2e97c208c0..019fa5ec24 100644 --- a/ecc/bls24-315/g2_test.go +++ b/ecc/bls24-315/g2_test.go @@ -19,7 +19,6 @@ package bls24315 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/internal/fptower" @@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // 
random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 5db551830d..3714629530 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] 
= BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 1d4e27c062..4a28082836 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -989,23 +989,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1014,7 +1008,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1028,27 +1022,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bls24-317/g1_test.go b/ecc/bls24-317/g1_test.go index 59fd0c425f..3a89f924e5 100644 --- a/ecc/bls24-317/g1_test.go +++ b/ecc/bls24-317/g1_test.go @@ -19,7 +19,6 @@ package bls24317 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { 
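// The single-inversion trick that batchAddG1Affine and batchInvertG1Affine
// rely on can be seen in isolation. Below is a minimal, self-contained
// sketch of Montgomery batch inversion over math/big — illustrative only:
// the toy prime, the batchInvert name and the main harness are not part of
// this patch (the real code is the fixed-size routine over fp.Element).

package main

import (
	"fmt"
	"math/big"
)

// batchInvert returns the modular inverses of all a[i] at the cost of a
// single ModInverse: a forward pass stores prefix products, then one
// inversion of the total product is unwound in a backward pass.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*a[1]*...*a[i-1] mod p
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the only field inversion in the whole batch
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // now res[i] = 1/a[i] mod p
		acc.Mul(acc, a[i]).Mod(acc, p)         // strip a[i] from the accumulator
	}
	return res
}

func main() {
	p := big.NewInt(2147483647) // toy prime modulus (2^31 - 1), assumption for the demo
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(11)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(check.Mod(check, p)) // prints 1 three times
	}
}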
	// ensure every word of the scalars is filled
diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go
index bbfcfd12b2..acd1176cdd 100644
--- a/ecc/bls24-317/g2.go
+++ b/ecc/bls24-317/g2.go
@@ -999,23 +999,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 // R[:cptAdd], P[:cptAdd] contain point references to ADD
 // R[N-cptSub:], P[N-cptSub:] contain point references to SUB
 // cptAdd + cptSub == batchSize, and batchSize may be smaller than N
-func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) {
-	batchSize := cptAdd + cptSub
+func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
+	batchSize := len(R)
 	if batchSize == 0 {
 		return
 	}
 	var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4

-	j := 0
 	// add part
-	for j = 0; j < cptAdd; j++ {
+	for j := 0; j < batchSize; j++ {
 		lambdain[j].Sub(&P[j].X, &R[j].X)
 	}
-	// sub part
-	for i := len(R) - cptSub; i < len(R); i++ {
-		lambdain[j].Sub(&P[i].X, &R[i].X)
-		j++
-	}

 	// invert denominator
 	batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize])
@@ -1024,7 +1018,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) {
 	var rr G2Affine

 	// add part
-	for j := 0; j < cptAdd; j++ {
+	for j := 0; j < batchSize; j++ {
 		// compute lambda
 		d.Sub(&P[j].Y, &R[j].Y)
 		lambda[j].Mul(&lambda[j], &d)
@@ -1038,27 +1032,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) {
 		rr.Y.Sub(&rr.Y, &R[j].Y)
 		R[j].Set(&rr)
 	}
-
-	// middle of the input may be ignored if cptAdd + cptSub != len(R)
-	offset := len(R) - batchSize
-
-	// sub part
-	for j := cptAdd; j < batchSize; j++ {
-		// compute lambda
-		idx := j + offset
-		d.Neg(&P[idx].Y)
-		d.Sub(&d, &R[idx].Y)
-		lambda[j].Mul(&lambda[j], &d)
-
-		// compute X, Y
-		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[idx].X)
-		rr.X.Sub(&rr.X, &P[idx].X)
-		d.Sub(&R[idx].X, &rr.X)
-		rr.Y.Mul(&lambda[j], &d)
-		rr.Y.Sub(&rr.Y, &R[idx].Y)
-		R[idx].Set(&rr)
-	}
 }

 // batch inversion
diff --git a/ecc/bls24-317/g2_test.go b/ecc/bls24-317/g2_test.go
index f02c85f79d..1d7ed1f3ff 100644
--- a/ecc/bls24-317/g2_test.go
+++ b/ecc/bls24-317/g2_test.go
@@ -19,7 +19,6 @@ package bls24317
 import (
	"fmt"
	"math/big"
-	"math/rand"
	"testing"

	"github.com/consensys/gnark-crypto/ecc/bls24-317/internal/fptower"
@@ -506,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) {
 }

-func BenchmarkBatchAddG2Affine(b *testing.B) {
-	var P, R [MAX_BATCH_SIZE]G2Affine
-	var RR, PP [MAX_BATCH_SIZE]*G2Affine
-	var ridx [MAX_BATCH_SIZE]int
+// func BenchmarkBatchAddG2Affine(b *testing.B) {
+// 	var P, R [MAX_BATCH_SIZE]G2Affine
+// 	var RR, PP [MAX_BATCH_SIZE]*G2Affine
+// 	var ridx [MAX_BATCH_SIZE]int

-	fillBenchBasesG2(P[:])
-	fillBenchBasesG2(R[:])
+// 	fillBenchBasesG2(P[:])
+// 	fillBenchBasesG2(R[:])

-	for i := 0; i < len(ridx); i++ {
-		ridx[i] = i
-	}
+// 	for i := 0; i < len(ridx); i++ {
+// 		ridx[i] = i
+// 	}

-	// random permute
-	rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] })
+// 	// random permute
+// 	rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] })

-	for i, ri := range ridx {
-		RR[i] = &R[ri]
-		PP[i] = &P[ri]
-	}
+// 	for i, ri := range ridx {
+// 		RR[i] = &R[ri]
+// 		PP[i] = &P[ri]
+// 	}

-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2)
-	}
+// 	b.ResetTimer()
+// 	for i := 0; i < b.N; i++ {
+// 		batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2)
+// 	}

-}
+// }

 func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) {
	// ensure every word of the scalars is
filled diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index b517f1ef32..789e3e18be 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 6f9a4d4e59..84e72c7c5f 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -959,23 +959,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if 
batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -984,7 +978,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -998,27 +992,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bn254/g1_test.go b/ecc/bn254/g1_test.go index ffc6160b01..2e1973a911 100644 --- a/ecc/bn254/g1_test.go +++ b/ecc/bn254/g1_test.go @@ -19,7 +19,6 @@ package bn254 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/fp" @@ -461,32 +460,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 762a6f944b..2bef2cba4c 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -988,23 +988,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 - j := 0 // add part - 
for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1013,7 +1007,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1027,27 +1021,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bn254/g2_test.go b/ecc/bn254/g2_test.go index 9d0b38a11a..ae107fea78 100644 --- a/ecc/bn254/g2_test.go +++ b/ecc/bn254/g2_test.go @@ -19,7 +19,6 @@ package bn254 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/internal/fptower" @@ -505,32 +504,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 48b5e6e242..9880a1276b 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B 
ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -284,9 +279,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -294,19 +288,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -350,16 +343,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index 9e61e67732..d70a92aeec 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1091,23 +1091,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1116,7 +1110,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1130,27 +1124,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may 
be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-633/g1_test.go b/ecc/bw6-633/g1_test.go index 91e28e75e0..a2b6c273f6 100644 --- a/ecc/bw6-633/g1_test.go +++ b/ecc/bw6-633/g1_test.go @@ -19,7 +19,6 @@ package bw6633 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index 6f021168b5..a84adbb320 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -954,23 +954,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -979,7 +973,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -993,27 +987,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // 
sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-633/g2_test.go b/ecc/bw6-633/g2_test.go index a51ae94c50..f5c4d5edca 100644 --- a/ecc/bw6-633/g2_test.go +++ b/ecc/bw6-633/g2_test.go @@ -19,7 +19,6 @@ package bw6633 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -487,32 +486,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 74c9b3d4dc..5b679cba91 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue 
[MAX_BATCH_SIZE]batchOp @@ -266,9 +261,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -276,19 +270,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -332,16 +325,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index fee3c6884b..57631a43e5 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1091,23 +1091,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1116,7 +1110,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1130,27 +1124,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-756/g1_test.go b/ecc/bw6-756/g1_test.go index cfc93383c9..bd7a65f693 100644 --- a/ecc/bw6-756/g1_test.go +++ b/ecc/bw6-756/g1_test.go @@ -19,7 +19,6 @@ package bw6756 import ( "fmt" "math/big" - "math/rand" "testing" 
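// The per-point work inside batchAddG1Affine is the plain affine chord
// rule. Here is a minimal sketch over math/big with hypothetical names
// (toy prime, affine struct, chordAdd — none of them from this patch);
// it only illustrates the formulas the loops above implement:

package main

import (
	"fmt"
	"math/big"
)

var p = big.NewInt(2147483647) // toy prime standing in for the real fp modulus

type affine struct{ x, y big.Int }

// chordAdd sets r = r + q, given lam = 1/(q.x - r.x) mod p; in the patch
// those denominators are exactly what goes through batch inversion, so
// each addition costs a few multiplications but no per-point inversion.
func chordAdd(r, q *affine, lam *big.Int) {
	var l, d, x, y big.Int
	d.Sub(&q.y, &r.y)
	l.Mul(&d, lam).Mod(&l, p) // l = (q.y - r.y)/(q.x - r.x), the chord slope
	x.Mul(&l, &l).Sub(&x, &r.x)
	x.Sub(&x, &q.x).Mod(&x, p) // x3 = l^2 - r.x - q.x
	d.Sub(&r.x, &x)
	y.Mul(&l, &d).Sub(&y, &r.y).Mod(&y, p) // y3 = l*(r.x - x3) - r.y
	r.x.Set(&x)
	r.y.Set(&y)
}

func main() {
	r := affine{*big.NewInt(5), *big.NewInt(9)} // placeholder coordinates,
	q := affine{*big.NewInt(2), *big.NewInt(7)} // not genuine curve points
	lam := new(big.Int).Sub(&q.x, &r.x)
	lam.ModInverse(lam.Mod(lam, p), p)
	chordAdd(&r, &q, lam)
	fmt.Println(&r.x, &r.y)
	// Subtracting q is the same call with (q.x, -q.y): that identity is
	// why P[cptAdd].Neg(PP) lets the new code drop the cptSub partition.
}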
"github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index 195322273e..c2b10451c0 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -948,23 +948,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -973,7 +967,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -987,27 +981,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-756/g2_test.go b/ecc/bw6-756/g2_test.go index 699df087ed..7d98c06668 100644 --- a/ecc/bw6-756/g2_test.go +++ b/ecc/bw6-756/g2_test.go @@ -19,7 +19,6 @@ package bw6756 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -487,32 +486,32 @@ func 
BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 02bc11523c..fade05f3ff 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -266,9 +261,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -276,19 +270,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == 
batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -332,16 +325,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 3537495b46..08f5e476d9 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1102,23 +1102,17 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG1Affine(R []*G1Affine, P []G1Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -1127,7 +1121,7 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { var rr G1Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1141,27 +1135,6 @@ func batchAddG1Affine(R, P []*G1Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-761/g1_test.go b/ecc/bw6-761/g1_test.go index 5b1b389102..4cbc725f60 100644 --- a/ecc/bw6-761/g1_test.go +++ b/ecc/bw6-761/g1_test.go @@ -19,7 +19,6 @@ package bw6761 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -500,32 +499,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG1Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G1Affine - var RR, PP [MAX_BATCH_SIZE]*G1Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG1Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G1Affine +// var RR, PP [MAX_BATCH_SIZE]*G1Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG1(P[:]) - fillBenchBasesG1(R[:]) +// fillBenchBasesG1(P[:]) +// fillBenchBasesG1(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // 
random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 41cfea623f..48a7a69586 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -962,23 +962,17 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAddG2Affine(R []*G2Affine, P []G2Affine) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]fp.Element - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i := len(R) - cptSub; i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) @@ -987,7 +981,7 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { var rr G2Affine // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1001,27 +995,6 @@ func batchAddG2Affine(R, P []*G2Affine, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j + offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } // batch inversion diff --git a/ecc/bw6-761/g2_test.go b/ecc/bw6-761/g2_test.go index 76d8b7f7de..7fa415d6a5 100644 --- a/ecc/bw6-761/g2_test.go +++ b/ecc/bw6-761/g2_test.go @@ -19,7 +19,6 @@ package bw6761 import ( "fmt" "math/big" - "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -487,32 +486,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -func BenchmarkBatchAddG2Affine(b *testing.B) { - var P, R [MAX_BATCH_SIZE]G2Affine - var RR, PP [MAX_BATCH_SIZE]*G2Affine - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAddG2Affine(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]G2Affine +// var RR, PP [MAX_BATCH_SIZE]*G2Affine +// var ridx [MAX_BATCH_SIZE]int - fillBenchBasesG2(P[:]) - fillBenchBasesG2(R[:]) +// fillBenchBasesG2(P[:]) +// fillBenchBasesG2(R[:]) - for i := 0; i < len(ridx); i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = 
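
The batchInvert* call that feeds those lambdas uses Montgomery's trick: one forward pass of prefix products, a single modular inversion, then a backward pass that peels off each individual inverse, i.e. one I plus roughly 3(n-1) M for n inverses. A minimal sketch, again with math/big in place of fp.Element and assuming every input is nonzero mod p (the real helper likewise ignores edge cases):

package main

import (
	"fmt"
	"math/big"
)

// batchInvert computes the inverses of a[0..n-1] mod p with a single
// modular inversion plus about 3(n-1) multiplications.
// All inputs are assumed nonzero mod p.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	// forward pass: res[i] holds the product a[0]*...*a[i-1]
	for i := range a {
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	// one inversion of the full product
	acc.ModInverse(acc, p)
	// backward pass: prefix product * inverse of remaining suffix = a[i]^-1
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p)
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(10)}
	for i, inv := range batchInvert(a, p) {
		fmt.Printf("%v^-1 mod %v = %v\n", a[i], p, inv) // 34, 29, 91
	}
}
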
ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE/2, MAX_BATCH_SIZE/2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 4c9c97691d..325f500b60 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -56,9 +56,8 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G1Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G1Affine // bucket references canAdd := func(bID uint32) bool { @@ -66,19 +65,18 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG1Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG1Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -122,16 +120,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp @@ -266,9 +261,8 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*G2Affine // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*G2Affine // bucket references canAdd := func(bID uint32) bool { @@ -276,19 +270,18 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } isFull := func() bool { - return (cptAdd + cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func() { - if (cptAdd + cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAddG2Affine(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAddG2Affine(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -332,16 +325,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, } bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize-cptSub] = BK - P[batchSize-cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } var queue [MAX_BATCH_SIZE]batchOp diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl 
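
Note the shape of the refactor repeated in the hunks above: instead of keeping additions at the front of the batch and subtractions at the back, a subtraction bucket - P is rewritten as bucket + (-P) by copying P with its Y coordinate negated (P[cptAdd].Neg(PP)), so a single loop and a single slice layout handle both cases. On a short Weierstrass curve, negating an affine point is a single field negation; a small sketch with an illustrative affine type:

package main

import (
	"fmt"
	"math/big"
)

type affine struct{ X, Y *big.Int }

// negAffine returns -pt: on a short Weierstrass curve the inverse of
// (x, y) is (x, p-y). This cheap step is what lets P[cptAdd].Neg(PP)
// turn every "subtract point from bucket" into a plain batched addition.
func negAffine(pt affine, p *big.Int) affine {
	negY := new(big.Int).Sub(p, pt.Y)
	negY.Mod(negY, p)
	return affine{new(big.Int).Set(pt.X), negY}
}

func main() {
	p := big.NewInt(101)
	q := negAffine(affine{big.NewInt(1), big.NewInt(2)}, p)
	fmt.Printf("-(1,2) = (%v,%v) mod 101\n", q.X, q.Y) // (1,99)
}
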
b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 8902aeb919..044f5d3cc9 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -57,9 +57,8 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - cptSub := 0 // count the number of bucket - point added to current batch - var P [MAX_BATCH_SIZE]*{{ $.TAffine }} // points to be added to R (buckets) + var P [MAX_BATCH_SIZE]{{ $.TAffine }} // points to be added to R (buckets) var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // bucket references canAdd := func(bID uint32) bool { @@ -67,19 +66,18 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c } isFull := func() bool { - return (cptAdd+cptSub) == batchSize + return (cptAdd) == batchSize } executeAndReset := func () { - if (cptAdd+cptSub) == 0 { + if (cptAdd) == 0 { return } - batchAdd{{ $.TAffine }}(R[:batchSize], P[:batchSize], cptAdd, cptSub) + batchAdd{{ $.TAffine }}(R[:cptAdd], P[:cptAdd]) var tmp BS bucketIds = tmp cptAdd = 0 - cptSub = 0 } add := func(op batchOp) { @@ -124,16 +122,13 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c bucketIds[op.bucketID] = true + R[cptAdd] = BK if op.isNeg() { - cptSub++ - R[batchSize - cptSub] = BK - P[batchSize - cptSub] = PP + P[cptAdd].Neg(PP) } else { - R[cptAdd] = BK - P[cptAdd] = PP - cptAdd++ + P[cptAdd].Set(PP) } - + cptAdd++ } diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index d1107bb1c2..2a3b418cbb 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1578,24 +1578,18 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca // R[:cptAdd], P[:cptAdd] contains points references to ADD // R[N-cptSub:], P[N-cptSub] contains points references to SUB // cptAdd + cptSub == batchSize, and batchSize may be smaller than N -func batchAdd{{ $TAffine }}(R,P []*{{ $TAffine }}, cptAdd, cptSub int) { - batchSize := cptAdd + cptSub +func batchAdd{{ $TAffine }}(R []*{{ $TAffine }},P []{{ $TAffine }}) { + batchSize := len(R) if batchSize == 0 { return } var lambda, lambdain [MAX_BATCH_SIZE]{{.CoordType}} - j := 0 // add part - for j = 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { lambdain[j].Sub(&P[j].X, &R[j].X) } - // sub part - for i:=len(R) - cptSub ;i < len(R); i++ { - lambdain[j].Sub(&P[i].X, &R[i].X) - j++ - } // invert denominator batchInvert{{ $TAffine }}(lambda[:batchSize], lambdain[:batchSize]) @@ -1604,7 +1598,7 @@ func batchAdd{{ $TAffine }}(R,P []*{{ $TAffine }}, cptAdd, cptSub int) { var rr {{ $TAffine }} // add part - for j := 0; j < cptAdd; j++ { + for j := 0; j < batchSize; j++ { // computa lambda d.Sub(&P[j].Y, &R[j].Y) lambda[j].Mul(&lambda[j], &d) @@ -1618,27 +1612,6 @@ func batchAdd{{ $TAffine }}(R,P []*{{ $TAffine }}, cptAdd, cptSub int) { rr.Y.Sub(&rr.Y, &R[j].Y) R[j].Set(&rr) } - - // middle of the input may be ignored if cptAdd + cptSub != len(R) - offset := len(R) - batchSize - - // sub part - for j := cptAdd; j < batchSize; j++ { - // computa lambda - idx := j+offset - d.Neg(&P[idx].Y) - d.Sub(&d, &R[idx].Y) - lambda[j].Mul(&lambda[j], &d) - - // compute X, Y - rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[idx].X) - rr.X.Sub(&rr.X, &P[idx].X) - 
d.Sub(&R[idx].X, &rr.X) - rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[idx].Y) - R[idx].Set(&rr) - } } diff --git a/internal/generator/ecc/template/tests/point.go.tmpl b/internal/generator/ecc/template/tests/point.go.tmpl index ee54b2dd1b..223bfbe040 100644 --- a/internal/generator/ecc/template/tests/point.go.tmpl +++ b/internal/generator/ecc/template/tests/point.go.tmpl @@ -16,7 +16,6 @@ import ( "fmt" "math/big" "testing" - "math/rand" {{if or (eq .CoordType "fptower.E2") (eq .CoordType "fptower.E4")}} "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" @@ -560,32 +559,32 @@ func Benchmark{{ $TJacobian }}IsInSubGroup(b *testing.B) { } -func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { - var P, R [MAX_BATCH_SIZE]{{ $TAffine }} - var RR, PP [MAX_BATCH_SIZE]*{{ $TAffine }} - var ridx [MAX_BATCH_SIZE]int +// func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { +// var P, R [MAX_BATCH_SIZE]{{ $TAffine }} +// var RR, PP [MAX_BATCH_SIZE]*{{ $TAffine }} +// var ridx [MAX_BATCH_SIZE]int - fillBenchBases{{ toUpper $.PointName }}(P[:]) - fillBenchBases{{ toUpper $.PointName }}(R[:]) +// fillBenchBases{{ toUpper $.PointName }}(P[:]) +// fillBenchBases{{ toUpper $.PointName }}(R[:]) - for i:=0; i < len(ridx);i++ { - ridx[i] = i - } +// for i:=0; i < len(ridx);i++ { +// ridx[i] = i +// } - // random permute - rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) +// // random permute +// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) - for i, ri := range ridx { - RR[i] = &R[ri] - PP[i] = &P[ri] - } +// for i, ri := range ridx { +// RR[i] = &R[ri] +// PP[i] = &P[ri] +// } - b.ResetTimer() - for i := 0; i < b.N; i++ { - batchAdd{{ $TAffine }}(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) - } +// b.ResetTimer() +// for i := 0; i < b.N; i++ { +// batchAdd{{ $TAffine }}(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) +// } -} +// } func Benchmark{{ $TAffine }}BatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled From d60bf24592e54e114772825230c0d115cdbb1a4d Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 12:19:30 -0600 Subject: [PATCH 18/43] feat: since we cap c==16 we may as well use uint16 --- ecc/bls12-377/g1.go | 8 ++--- ecc/bls12-377/g2.go | 8 ++--- ecc/bls12-377/multiexp.go | 18 +++++----- ecc/bls12-377/multiexp_affine.go | 35 +++++++++++-------- ecc/bls12-377/multiexp_jacobian.go | 4 +-- ecc/bls12-378/g1.go | 8 ++--- ecc/bls12-378/g2.go | 8 ++--- ecc/bls12-378/multiexp.go | 18 +++++----- ecc/bls12-378/multiexp_affine.go | 35 +++++++++++-------- ecc/bls12-378/multiexp_jacobian.go | 4 +-- ecc/bls12-381/g1.go | 8 ++--- ecc/bls12-381/g2.go | 8 ++--- ecc/bls12-381/multiexp.go | 18 +++++----- ecc/bls12-381/multiexp_affine.go | 35 +++++++++++-------- ecc/bls12-381/multiexp_jacobian.go | 4 +-- ecc/bls24-315/g1.go | 8 ++--- ecc/bls24-315/g2.go | 8 ++--- ecc/bls24-315/multiexp.go | 18 +++++----- ecc/bls24-315/multiexp_affine.go | 35 +++++++++++-------- ecc/bls24-315/multiexp_jacobian.go | 4 +-- ecc/bls24-317/g1.go | 8 ++--- ecc/bls24-317/g2.go | 8 ++--- ecc/bls24-317/multiexp.go | 18 +++++----- ecc/bls24-317/multiexp_affine.go | 35 +++++++++++-------- ecc/bls24-317/multiexp_jacobian.go | 4 +-- ecc/bn254/g1.go | 8 ++--- ecc/bn254/g2.go | 8 ++--- ecc/bn254/multiexp.go | 18 +++++----- ecc/bn254/multiexp_affine.go | 35 +++++++++++-------- ecc/bn254/multiexp_jacobian.go | 4 +-- ecc/bw6-633/g1.go | 8 ++--- ecc/bw6-633/g2.go | 8 ++--- ecc/bw6-633/multiexp.go | 18 
+++++----- ecc/bw6-633/multiexp_affine.go | 35 +++++++++++-------- ecc/bw6-633/multiexp_jacobian.go | 4 +-- ecc/bw6-756/g1.go | 8 ++--- ecc/bw6-756/g2.go | 8 ++--- ecc/bw6-756/multiexp.go | 18 +++++----- ecc/bw6-756/multiexp_affine.go | 35 +++++++++++-------- ecc/bw6-756/multiexp_jacobian.go | 4 +-- ecc/bw6-761/g1.go | 8 ++--- ecc/bw6-761/g2.go | 8 ++--- ecc/bw6-761/multiexp.go | 18 +++++----- ecc/bw6-761/multiexp_affine.go | 35 +++++++++++-------- ecc/bw6-761/multiexp_jacobian.go | 4 +-- .../generator/ecc/template/multiexp.go.tmpl | 14 ++++---- .../ecc/template/multiexp_affine.go.tmpl | 20 ++++++----- .../ecc/template/multiexp_jacobian.go.tmpl | 2 +- internal/generator/ecc/template/point.go.tmpl | 8 ++--- 49 files changed, 337 insertions(+), 364 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 3be98b91a4..590c40246e 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -980,13 +980,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 4b6f3de628..92a51f4b54 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -976,13 +976,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 53a1823e0d..6360f5cb35 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func 
_innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 41c16a3afe..eef9112dda 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -19,7 +19,8 @@ package bls12377 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init 
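
The digit encoding that partitionScalars emits packs a signed window digit d into an unsigned value: d >= 0 becomes d<<1, and d < 0 becomes ((-d-1)<<1)|1, so the low bit flags a subtraction and the remaining bits select the bucket ((bits>>1)-1 for an add, bits>>1 for a sub, matching the decoding in processChunk*BatchAffine). With the window size capped at c == 16 the encoded value always fits in 16 bits, which is what lets digits shrink from []uint32 to []uint16 and halves that slice's memory traffic. A small round-trip sketch (hypothetical helper names, not the library's API):

package main

import "fmt"

// encodeDigit packs a signed window digit into a uint16: the low bit is
// the "negative" flag, the high bits carry the magnitude, mirroring the
// bits computation in partitionScalars above.
func encodeDigit(d int) uint16 {
	if d >= 0 {
		return uint16(d) << 1
	}
	return (uint16(-d-1) << 1) | 1
}

// decodeDigit recovers the signed digit, mirroring the bucketID
// computation in processChunk*BatchAffine.
func decodeDigit(bits uint16) int {
	if bits&1 == 0 {
		return int(bits >> 1) // bucket (bits>>1)-1, point is added
	}
	return -int(bits>>1) - 1 // bucket bits>>1, point is subtracted
}

func main() {
	for _, d := range []int{0, 1, 5, -1, -5, 1 << 14} {
		bits := encodeDigit(d)
		fmt.Printf("d=%6d  bits=%#06x  decoded=%6d\n", d, bits, decodeDigit(bits))
	}
}
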
the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index be722067bd..ae9aca6c47 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 1545108a66..08bd9e0fda 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -980,13 +980,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 26aaa42624..840049be26 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -976,13 +976,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 
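
The processChunk*BatchAffine logic above enforces one invariant: a bucket may appear at most once per batch, because all the batched additions share one inversion and must be independent. Hence the bucketIds bitset guarding canAdd, the queue holding colliding ops, and executeAndReset flushing a full batch before the queue is retried. A stripped-down sketch of that scheduling pattern, where a map stands in for the BS bitset and the flush callback stands in for batchAdd* (types and names are illustrative, not the library's API):

package main

import "fmt"

type op struct{ bucketID uint16 }

type batcher struct {
	inBatch map[uint16]bool // stands in for the BS bitset
	batch   []op
	queue   []op
	size    int
	flush   func([]op)
}

func (b *batcher) executeAndReset() {
	if len(b.batch) == 0 {
		return
	}
	b.flush(b.batch) // one batch-inverted addition pass in the real code
	b.batch = b.batch[:0]
	b.inBatch = make(map[uint16]bool)
}

func (b *batcher) add(o op) {
	if b.inBatch[o.bucketID] { // conflict: same bucket twice in one batch
		b.queue = append(b.queue, o)
		return
	}
	b.inBatch[o.bucketID] = true
	b.batch = append(b.batch, o)
	if len(b.batch) == b.size {
		b.executeAndReset()
		// retry queued ops now that the batch is empty
		q := b.queue
		b.queue = b.queue[:0]
		for _, o := range q {
			b.add(o)
		}
	}
}

func main() {
	b := &batcher{inBatch: make(map[uint16]bool), size: 3,
		flush: func(ops []op) { fmt.Println("flush", ops) }}
	for _, id := range []uint16{1, 2, 2, 3, 4, 1} {
		b.add(op{id})
	}
	b.executeAndReset()
	fmt.Println("left in queue:", b.queue)
}
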
73e162f80d..e88325d4ae 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 95eb76b3ac..6023381a17 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -19,7 +19,8 @@ package bls12378 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // 
points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 6a8cfa2d32..0637114932 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 5a59011791..6c3edabedc 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -980,13 +980,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// 
cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index 6b7dfa5639..3768ec7eda 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -977,13 +977,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 80ff8bfc30..fabd850c74 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if 
digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 5a51ee46b6..f388d3d5ea 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -19,7 +19,8 @@ package bls12381 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index fabbf2d237..17139a4f22 
100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index cd0d0a8a69..25faa396cf 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -982,13 +982,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 7fa2e026c3..32601c0b08 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -992,13 +992,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index f61ab96f3d..207a4c7f23 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { 
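
For orientation, the chunk count computed just below is ceil(number of scalar bits / c): a 4-limb, 256-bit scalar gives 16 chunks at c == 16 and 22 chunks (21 full plus one partial) at c == 12. Restated as a tiny standalone function:

package main

import "fmt"

// nbChunks returns the number of c-bit windows in an n-limb scalar,
// the same ceiling division _innerMsm* performs below.
func nbChunks(limbs, c uint64) uint64 {
	n := limbs * 64 / c
	if (limbs*64)%c != 0 {
		n++
	}
	return n
}

func main() {
	fmt.Println(nbChunks(4, 16)) // 256-bit scalar, c=16 -> 16 windows
	fmt.Println(nbChunks(4, 12)) // c=12 -> 22 windows (21 full + 1 partial)
}
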
nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 3714629530..9ec9c35382 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -19,7 +19,8 @@ package bls24315 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of 
bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index a3d633de01..6e3ea0e2f9 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 4a28082836..f1e3773049 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -982,13 +982,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index acd1176cdd..0f1693cdc8 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -992,13 +992,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 8b81840e50..b9baa2cec7 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, 
points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 789e3e18be..e27eb9efeb 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -19,7 +19,8 @@ package bls24317 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to 
store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index 7e832db4e7..c4fc41bc54 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 84e72c7c5f..75a3e25983 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -952,13 +952,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git 
a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 2bef2cba4c..6437e4542e 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -981,13 +981,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index ac979ddff7..75e8a96061 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -205,8 +205,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -453,8 +453,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -541,14 +541,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -630,11 +630,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bn254/multiexp_affine.go 
b/ecc/bn254/multiexp_affine.go index 9880a1276b..1f6ba85280 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -19,7 +19,8 @@ package bn254 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -259,7 +262,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -280,10 +283,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -391,10 +397,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -411,7 +417,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index a682232ec6..288063d39a 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); 
i++ { @@ -99,7 +99,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index d70a92aeec..707fbefb6f 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1084,13 +1084,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index a84adbb320..69e4b4263c 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -947,13 +947,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 23c35d3d90..fb6367cb6b 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -168,8 +168,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -379,8 +379,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -467,14 +467,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in 
other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -556,11 +556,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 5b679cba91..38c1973d0e 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -19,7 +19,8 @@ package bw6633 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -241,7 +244,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -262,10 +265,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine 
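// Aside: the "batch inversion" these hunks rely on is Montgomery's trick:
// n field inversions are traded for one inversion plus roughly 3n
// multiplications, which is why the (removed) cost comment reads roughly
// "5*batchSize M + 1I" per batch of affine additions. A minimal,
// self-contained sketch over a toy modulus using math/big -- the names
// here are illustrative, not gnark-crypto's fp.Element API, but the
// fp.Element version in the hunks follows the same shape:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert replaces each a[i] by a[i]^-1 mod p using a single modular
// inversion: accumulate prefix products forward, invert the total once,
// then unwind backwards.
func batchInvert(a []*big.Int, p *big.Int) {
	n := len(a)
	if n == 0 {
		return
	}
	prefix := make([]*big.Int, n)
	acc := big.NewInt(1)
	for i := 0; i < n; i++ {
		prefix[i] = new(big.Int).Set(acc) // product of a[0..i-1]
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the single inversion, of the full product
	for i := n - 1; i >= 0; i-- {
		tmp := new(big.Int).Set(a[i])
		a[i].Mul(prefix[i], acc).Mod(a[i], p) // a[i]^-1
		acc.Mul(acc, tmp).Mod(acc, p)         // drop a[i] from the running inverse
	}
}

func main() {
	p := big.NewInt(101) // toy prime standing in for the base-field modulus
	xs := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(42)}
	batchInvert(xs, p)
	fmt.Println(xs) // inverses of 3, 7, 42 mod 101: [34 29 89]
}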
- canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -373,10 +379,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -393,7 +399,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index 29756cc499..e39d7fc165 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -77,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index 57631a43e5..395f0a7f84 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1084,13 +1084,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index c2b10451c0..63a4631e6d 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -941,13 +941,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 0124b603f4..0ba6a6ed57 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -169,8 +169,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c 
uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -381,8 +381,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -469,14 +469,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -558,11 +558,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index fade05f3ff..c59b38e882 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -19,7 +19,8 @@ package bw6756 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { 
// sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -241,7 +244,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -262,10 +265,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -373,10 +379,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -393,7 +399,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 10a354ae58..0cba708584 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -77,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 08f5e476d9..880371b042 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1095,13 +1095,9 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin return toReturnAff } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG1Affine(R []*G1Affine, P []G1Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 48a7a69586..892cedad40 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -955,13 +955,9 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin return toReturn } -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) 
== len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAddG2Affine(R []*G2Affine, P []G2Affine) { batchSize := len(R) if batchSize == 0 { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index fc2c7c4908..f8165ca221 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -169,8 +169,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint32)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -381,8 +381,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint32)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { @@ -469,14 +469,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs*64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -558,11 +558,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 325f500b60..83b2c11fbe 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -19,7 +19,8 @@ package bw6761 const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -36,7 +37,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS 
bitSet](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -57,10 +58,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G1Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G1Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G1Affine - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G1Affine + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -168,10 +172,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -188,7 +192,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } @@ -241,7 +244,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -262,10 +265,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]G2Affine // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*G2Affine // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*G2Affine + + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]G2Affine - canAdd := func(bID uint32) bool { + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -373,10 +379,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit >> 1) - 1) + op.bucketID = uint16((digit >> 1) - 1) } else { // sub - op.bucketID = (uint32((digit >> 1))) + op.bucketID = (uint16((digit >> 1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -393,7 +399,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 045bace5e7..af2d68b853 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -20,7 +20,7 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { @@ -77,7 +77,7 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, - digits []uint32) { + digits []uint16) { var buckets B for i := 0; i < len(buckets); i++ { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 
5940c0e8c5..4655925e5e 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -40,14 +40,14 @@ type selector struct { // scalarsMont indicates wheter the provided scalars are in montgomery form // returns smallValues, which represent the number of scalars which meets the following condition // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint32, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar nbChunks := fr.Limbs * 64 / c if (fr.Limbs * 64)%c != 0 { nbChunks++ } - toReturn := make([]uint32, len(scalars)*int(nbChunks)) + toReturn := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window @@ -131,11 +131,11 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint32 + var bits uint16 if digit >= 0 { - bits = uint32(digit) << 1 + bits = uint16(digit) << 1 } else { - bits = (uint32(-digit-1) << 1) + 1 + bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits // [s.index] |= (bits << s.shift) @@ -447,8 +447,8 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint32, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint32)) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool, + processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16)) *{{ $.TJacobian }} { nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar if (fr.Limbs*64)%c != 0 { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 044f5d3cc9..cf1c0ce71f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -10,7 +10,8 @@ const MAX_BATCH_SIZE = 600 type batchOp struct { - bucketID, pointID uint32 + pointID uint32 + bucketID uint16 } func (o batchOp) isNeg() bool { @@ -37,7 +38,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - digits []uint32) { + digits []uint16) { // init the buckets var buckets B @@ -58,10 +59,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - var P [MAX_BATCH_SIZE]{{ $.TAffine }} // points to be added to R (buckets) - var R [MAX_BATCH_SIZE]*{{ $.TAffine }} // bucket references + // bucket references + var R [MAX_BATCH_SIZE]*{{ $.TAffine }} - canAdd := func(bID uint32) bool { + // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var P [MAX_BATCH_SIZE]{{ $.TAffine }} + + + canAdd := func(bID uint16) bool { 
return !bucketIds[bID] } @@ -171,10 +176,10 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c // if msbWindow bit is set, we need to substract if digit&1 == 0 { // add - op.bucketID = uint32((digit>>1) - 1) + op.bucketID = uint16((digit>>1) - 1) } else { // sub - op.bucketID = (uint32((digit>>1))) + op.bucketID = (uint16((digit>>1))) op.pointID += 1 } if canAdd(op.bucketID) { @@ -191,7 +196,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c executeAndReset() processQueue() } - // queue = append(queue, op) } } diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index ee1f1d2080..7aaec9f186 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -19,7 +19,7 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, - digits []uint32) { + digits []uint16) { diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index 2a3b418cbb..6e30ad9de8 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1571,13 +1571,9 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca -// batch add/sub in affine coordinates +// batch add affine coordinates // using batch inversion -// cost add: 5*batchSize M + 1I, dbl: +1M -// len(R) == len(P) == N -// R[:cptAdd], P[:cptAdd] contains points references to ADD -// R[N-cptSub:], P[N-cptSub] contains points references to SUB -// cptAdd + cptSub == batchSize, and batchSize may be smaller than N +// special cases (doubling, infinity) must be filtered out before this call func batchAdd{{ $TAffine }}(R []*{{ $TAffine }},P []{{ $TAffine }}) { batchSize := len(R) if batchSize == 0 { From 3fd6c7e5f21cc68ca0562317c118ff65513e0f20 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 15:54:41 -0600 Subject: [PATCH 19/43] perf: allocate batch affine arrays on the stack with generics --- ecc/bls12-377/g1.go | 61 +-- ecc/bls12-377/g2.go | 61 +-- ecc/bls12-377/multiexp.go | 28 +- ecc/bls12-377/multiexp_affine.go | 494 ++++++++++++------ ecc/bls12-378/g1.go | 61 +-- ecc/bls12-378/g2.go | 61 +-- ecc/bls12-378/multiexp.go | 28 +- ecc/bls12-378/multiexp_affine.go | 494 ++++++++++++------ ecc/bls12-381/g1.go | 61 +-- ecc/bls12-381/g2.go | 61 +-- ecc/bls12-381/multiexp.go | 28 +- ecc/bls12-381/multiexp_affine.go | 494 ++++++++++++------ ecc/bls24-315/g1.go | 61 +-- ecc/bls24-315/g2.go | 61 +-- ecc/bls24-315/multiexp.go | 28 +- ecc/bls24-315/multiexp_affine.go | 494 ++++++++++++------ ecc/bls24-317/g1.go | 61 +-- ecc/bls24-317/g2.go | 61 +-- ecc/bls24-317/multiexp.go | 28 +- ecc/bls24-317/multiexp_affine.go | 494 ++++++++++++------ ecc/bn254/g1.go | 61 +-- ecc/bn254/g2.go | 61 +-- ecc/bn254/multiexp.go | 28 +- ecc/bn254/multiexp_affine.go | 494 ++++++++++++------ ecc/bw6-633/g1.go | 61 +-- ecc/bw6-633/g2.go | 61 +-- ecc/bw6-633/multiexp.go | 4 +- ecc/bw6-633/multiexp_affine.go | 386 ++++++++------ ecc/bw6-756/g1.go | 61 +-- ecc/bw6-756/g2.go | 61 +-- ecc/bw6-756/multiexp.go | 4 +- ecc/bw6-756/multiexp_affine.go | 386 ++++++++------ ecc/bw6-761/g1.go | 61 +-- ecc/bw6-761/g2.go | 61 +-- ecc/bw6-761/multiexp.go | 4 +- ecc/bw6-761/multiexp_affine.go | 386 ++++++++------ internal/generator/ecc/generate.go | 32 ++ 
.../generator/ecc/template/multiexp.go.tmpl | 4 +- .../ecc/template/multiexp_affine.go.tmpl | 236 ++++++--- internal/generator/ecc/template/point.go.tmpl | 62 +-- 40 files changed, 3504 insertions(+), 2230 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 590c40246e..910dd07b5e 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -983,20 +983,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1004,36 +1015,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 92a51f4b54..0fe9a4119c 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -979,20 +979,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fptower.E2 + accumulator.SetOne() + + for i := 0; i 
< batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fptower.E2 var rr G2Affine @@ -1000,36 +1011,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E2, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E2) { - - var accumulator fptower.E2 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 6360f5cb35..9f2a1998fc 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, 
processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-377/multiexp_affine.go 
b/ecc/bls12-377/multiexp_affine.go index eef9112dda..c1d32b5ded 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -16,15 +16,18 @@ package bls12377 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-377/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
+ // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
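// Aside: the flow above (canAdd / add / addFromQueue / processQueue /
// executeAndReset) is a conflict-avoiding scheduler: two additions into
// the same bucket cannot share one batched inversion, because the second
// needs the result of the first, so a conflicting op is parked in a queue
// and retried after the current batch is flushed. A runnable sketch of
// just that discipline, with the curve arithmetic abstracted to integer
// sums (all names are illustrative, none are gnark-crypto's):

package main

import "fmt"

type op struct {
	bucketID int
	value    int
}

// schedule batches ops so that no two ops in one flush touch the same
// bucket, mirroring the bitSet + queue logic of processChunk*BatchAffine.
func schedule(ops []op, nbBuckets, batchCap int, flush func([]op)) {
	inBatch := make([]bool, nbBuckets) // plays the role of the bitSet BS
	batch := make([]op, 0, batchCap)
	var queue []op

	reset := func() {
		flush(batch)
		batch = batch[:0]
		for i := range inBatch {
			inBatch[i] = false
		}
	}
	tryAdd := func(o op) bool {
		if inBatch[o.bucketID] {
			return false // bucket already used in this batch
		}
		inBatch[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchCap {
			reset()
		}
		return true
	}

	for _, o := range ops {
		if !tryAdd(o) {
			queue = append(queue, o) // park the conflicting op
		}
	}
	for len(queue) > 0 { // drain: flush even partial batches to free buckets
		rest := queue[:0]
		for _, o := range queue {
			if !tryAdd(o) {
				rest = append(rest, o)
			}
		}
		queue = rest
		if len(batch) > 0 {
			reset()
		}
	}
	if len(batch) > 0 {
		reset()
	}
}

func main() {
	sums := make([]int, 4)
	flush := func(batch []op) {
		for _, o := range batch {
			sums[o.bucketID] += o.value // stands in for batchAdd*Affine
		}
	}
	schedule([]op{{1, 5}, {1, 7}, {2, 1}, {1, 2}, {3, 9}}, 4, 2, flush)
	fmt.Println(sums) // [0 14 1 9]
}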
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
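The loop above relies on a compact digit convention: a non-zero signed digit carries its sign in the low bit (odd means subtract) and the bucket index in the remaining bits, which is what lets a window of width c get away with 1 << (c-1) buckets. A small standalone decoder may make this easier to follow; decodeDigit is a hypothetical reading aid, not part of the generated code, and assumes the digit == 0 case has already been skipped:

// decodeDigit mirrors the decoding done inline in the digit loop above.
// Odd digits encode a subtraction from bucket digit>>1; even non-zero
// digits encode an addition into bucket (digit>>1)-1.
func decodeDigit(digit uint16) (bucketID uint16, isAdd bool) {
	isAdd = digit&1 == 0
	bucketID = digit >> 1
	if isAdd {
		bucketID--
	}
	return bucketID, isAdd
}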
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E2 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E2 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E2 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E2 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E2 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E2 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E2 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E2 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 08bd9e0fda..67d64790d7 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -983,20 +983,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) {
-	batchSize := len(R)
-	if batchSize == 0 {
-		return
-	}
-	var lambda, lambdain [MAX_BATCH_SIZE]fp.Element
+func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) {
+	var lambda, lambdain TC

 	// add part
 	for j := 0; j < batchSize; j++ {
-		lambdain[j].Sub(&P[j].X, &R[j].X)
+		lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X)
 	}

-	// invert denominator
-	batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize])
+	// invert denominator using Montgomery batch inversion technique
+	{
+		var accumulator fp.Element
+		accumulator.SetOne()
+
+		for i := 0; i < batchSize; i++ {
+			lambda[i] = accumulator
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+
+		accumulator.Inverse(&accumulator)
+
+		for i := batchSize - 1; i >= 0; i-- {
+			lambda[i].Mul(&lambda[i], &accumulator)
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+	}

 	var d fp.Element
 	var rr G1Affine
@@ -1004,36 +1015,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) {
 	// add part
 	for j := 0; j < batchSize; j++ {
 		// compute lambda
-		d.Sub(&P[j].Y, &R[j].Y)
+		d.Sub(&(*P)[j].Y, &(*R)[j].Y)
 		lambda[j].Mul(&lambda[j], &d)

 		// compute X, Y
 		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[j].X)
-		rr.X.Sub(&rr.X, &P[j].X)
-		d.Sub(&R[j].X, &rr.X)
+		rr.X.Sub(&rr.X, &(*R)[j].X)
+		rr.X.Sub(&rr.X, &(*P)[j].X)
+		d.Sub(&(*R)[j].X, &rr.X)
 		rr.Y.Mul(&lambda[j], &d)
-		rr.Y.Sub(&rr.Y, &R[j].Y)
-		R[j].Set(&rr)
-	}
-}
-
-// batch inversion
-// similar to BatchInvertfp.Element, ignores edge cases
-func batchInvertG1Affine(res, a []fp.Element) {
-
-	var accumulator fp.Element
-	accumulator.SetOne()
-
-	for i := 0; i < len(res); i++ {
-		res[i] = accumulator
-		accumulator.Mul(&accumulator, &a[i])
-	}
-
-	accumulator.Inverse(&accumulator)
-
-	for i := len(res) - 1; i >= 0; i-- {
-		res[i].Mul(&res[i], &accumulator)
-		accumulator.Mul(&accumulator, &a[i])
+		rr.Y.Sub(&rr.Y, &(*R)[j].Y)
+		(*R)[j].Set(&rr)
 	}
 }
diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go
index 840049be26..905e2ba893 100644
--- a/ecc/bls12-378/g2.go
+++ b/ecc/bls12-378/g2.go
@@ -979,20 +979,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 // batch add affine coordinates
 // using batch inversion
 // special cases (doubling, infinity) must be filtered out before this call
-func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
-	batchSize := len(R)
-	if batchSize == 0 {
-		return
-	}
-	var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2
+func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) {
+	var lambda, lambdain TC

 	// add part
 	for j := 0; j < batchSize; j++ {
-		lambdain[j].Sub(&P[j].X, &R[j].X)
+		lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X)
 	}

-	// invert denominator
-	batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize])
+	// invert denominator using Montgomery batch inversion technique
+	{
+		var accumulator fptower.E2
+		accumulator.SetOne()
+
+		for i := 0; i < batchSize; i++ {
+			lambda[i] = accumulator
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+
+		accumulator.Inverse(&accumulator)
+
+		for i := batchSize - 1; i >= 0; i-- {
+			lambda[i].Mul(&lambda[i], &accumulator)
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+	}

 	var d fptower.E2
 	var rr G2Affine
@@ -1000,36 +1011,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
 	// add part
 	for j := 0; j < batchSize; j++ {
 		// compute lambda
-		d.Sub(&P[j].Y, &R[j].Y)
+		d.Sub(&(*P)[j].Y, &(*R)[j].Y)
 		lambda[j].Mul(&lambda[j], &d)

 		// compute X, Y
 		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[j].X)
-		rr.X.Sub(&rr.X,
&P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E2, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E2) { - - var accumulator fptower.E2 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index e88325d4ae..d65962591c 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, 
digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 6023381a17..f060ffc11a 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -16,15 +16,18 @@ package bls12378 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-378/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of 
the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
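executeAndReset above hands the whole batch to batchAddG1Affine, which amortizes a single field inversion over every addition via the Montgomery batch inversion trick. A self-contained, slice-based sketch of just that trick (it assumes no input is zero, matching the generated code, which filters infinity and equal-x cases before batching):

package main

import (
	"fmt"

	"github.com/consensys/gnark-crypto/ecc/bls12-378/fp"
)

// batchInvert returns the inverses of a using one field inversion:
// a forward pass stores prefix products, a single Inverse is taken,
// and a backward pass peels off one factor per element.
func batchInvert(a []fp.Element) []fp.Element {
	res := make([]fp.Element, len(a))
	var acc fp.Element
	acc.SetOne()
	for i := 0; i < len(a); i++ {
		res[i] = acc // res[i] = a[0]*...*a[i-1]
		acc.Mul(&acc, &a[i])
	}
	acc.Inverse(&acc) // acc = 1/(a[0]*...*a[n-1])
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(&res[i], &acc) // prefix * suffix-inverse = 1/a[i]
		acc.Mul(&acc, &a[i])      // drop a[i] from the running inverse
	}
	return res
}

func main() {
	var x, y fp.Element
	x.SetUint64(3)
	y.SetUint64(7)
	inv := batchInvert([]fp.Element{x, y})
	fmt.Println(inv[0].String(), inv[1].String())
}

For n elements this costs 3n multiplications and one inversion instead of n inversions, which is what makes affine buckets competitive with the extended Jacobian path used for small windows.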
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
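Throughout this file, every per-batch buffer (points, bucket pointers, queued ops, coordinates) is typed as a fixed-size array drawn from a union interface, and batchSize := len(P) recovers the size at instantiation time. The point of this pattern is escape analysis: with a compile-time size the compiler can keep the buffers on the stack, where a slice of run-time length would typically be heap allocated. A toy illustration of the pattern, with invented names:

type buf128 [128]uint64
type buf256 [256]uint64

// anyBuf plays the role of the pG1Affine / qOpsG1Affine style unions:
// one named array type per supported size.
type anyBuf interface {
	buf128 | buf256
}

// fill instantiates with a concrete array type, so b has a known size
// and can stay on the stack, unlike make([]uint64, n).
func fill[B anyBuf](seed uint64) uint64 {
	var b B
	for i := 0; i < len(b); i++ {
		b[i] = seed + uint64(i)
	}
	return b[len(b)-1]
}

Calling fill[buf128](1) or fill[buf256](1) selects the size at the call site, exactly as the innerMsm switches select one set of concrete types per window size c.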
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E2 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E2 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E2 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E2 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E2 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E2 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E2 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E2 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 6c3edabedc..474c868025 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -983,20 +983,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) {
-	batchSize := len(R)
-	if batchSize == 0 {
-		return
-	}
-	var lambda, lambdain [MAX_BATCH_SIZE]fp.Element
+func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) {
+	var lambda, lambdain TC

 	// add part
 	for j := 0; j < batchSize; j++ {
-		lambdain[j].Sub(&P[j].X, &R[j].X)
+		lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X)
 	}

-	// invert denominator
-	batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize])
+	// invert denominator using Montgomery batch inversion technique
+	{
+		var accumulator fp.Element
+		accumulator.SetOne()
+
+		for i := 0; i < batchSize; i++ {
+			lambda[i] = accumulator
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+
+		accumulator.Inverse(&accumulator)
+
+		for i := batchSize - 1; i >= 0; i-- {
+			lambda[i].Mul(&lambda[i], &accumulator)
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+	}

 	var d fp.Element
 	var rr G1Affine
@@ -1004,36 +1015,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) {
 	// add part
 	for j := 0; j < batchSize; j++ {
 		// compute lambda
-		d.Sub(&P[j].Y, &R[j].Y)
+		d.Sub(&(*P)[j].Y, &(*R)[j].Y)
 		lambda[j].Mul(&lambda[j], &d)

 		// compute X, Y
 		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[j].X)
-		rr.X.Sub(&rr.X, &P[j].X)
-		d.Sub(&R[j].X, &rr.X)
+		rr.X.Sub(&rr.X, &(*R)[j].X)
+		rr.X.Sub(&rr.X, &(*P)[j].X)
+		d.Sub(&(*R)[j].X, &rr.X)
 		rr.Y.Mul(&lambda[j], &d)
-		rr.Y.Sub(&rr.Y, &R[j].Y)
-		R[j].Set(&rr)
-	}
-}
-
-// batch inversion
-// similar to BatchInvertfp.Element, ignores edge cases
-func batchInvertG1Affine(res, a []fp.Element) {
-
-	var accumulator fp.Element
-	accumulator.SetOne()
-
-	for i := 0; i < len(res); i++ {
-		res[i] = accumulator
-		accumulator.Mul(&accumulator, &a[i])
-	}
-
-	accumulator.Inverse(&accumulator)
-
-	for i := len(res) - 1; i >= 0; i-- {
-		res[i].Mul(&res[i], &accumulator)
-		accumulator.Mul(&accumulator, &a[i])
+		rr.Y.Sub(&rr.Y, &(*R)[j].Y)
+		(*R)[j].Set(&rr)
 	}
 }
diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go
index 3768ec7eda..a8575f59f7 100644
--- a/ecc/bls12-381/g2.go
+++ b/ecc/bls12-381/g2.go
@@ -980,20 +980,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin
 // batch add affine coordinates
 // using batch inversion
 // special cases (doubling, infinity) must be filtered out before this call
-func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
-	batchSize := len(R)
-	if batchSize == 0 {
-		return
-	}
-	var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2
+func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) {
+	var lambda, lambdain TC

 	// add part
 	for j := 0; j < batchSize; j++ {
-		lambdain[j].Sub(&P[j].X, &R[j].X)
+		lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X)
 	}

-	// invert denominator
-	batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize])
+	// invert denominator using Montgomery batch inversion technique
+	{
+		var accumulator fptower.E2
+		accumulator.SetOne()
+
+		for i := 0; i < batchSize; i++ {
+			lambda[i] = accumulator
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+
+		accumulator.Inverse(&accumulator)
+
+		for i := batchSize - 1; i >= 0; i-- {
+			lambda[i].Mul(&lambda[i], &accumulator)
+			accumulator.Mul(&accumulator, &lambdain[i])
+		}
+	}

 	var d fptower.E2
 	var rr G2Affine
@@ -1001,36 +1012,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) {
 	// add part
 	for j := 0; j < batchSize; j++ {
 		// compute lambda
-		d.Sub(&P[j].Y, &R[j].Y)
+		d.Sub(&(*P)[j].Y, &(*R)[j].Y)
 		lambda[j].Mul(&lambda[j], &d)

 		// compute X, Y
 		rr.X.Square(&lambda[j])
-		rr.X.Sub(&rr.X, &R[j].X)
-		rr.X.Sub(&rr.X,
&P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E2, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E2) { - - var accumulator fptower.E2 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index fabd850c74..edcb161b5e 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, 
digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index f388d3d5ea..da6be7a817 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -16,15 +16,18 @@ package bls12381 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" + "github.com/consensys/gnark-crypto/ecc/bls12-381/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of 
the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
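One invariant drives all of the canAdd / queue machinery above: a bucket may join a batch at most once, because every lambda in the batch is derived from bucket coordinates read before the shared inversion, so a second addition into the same bucket within one batch would work on stale values. Conflicting ops are therefore parked in a queue and retried after a flush. A miniature of that scheduling discipline, with made-up sizes and without the EC arithmetic:

package main

import "fmt"

const batchCap = 3

type op struct{ bucketID uint16 }

// schedule batches ops so that no two ops in a batch share a bucket,
// deferring conflicts to a queue, as processChunkG1BatchAffine does.
func schedule(ops []op, flush func([]op)) {
	var used [8]bool // one flag per bucket, like the bitSetCxx types
	batch := make([]op, 0, batchCap)
	var queue []op

	reset := func() {
		flush(batch)
		batch = batch[:0]
		used = [8]bool{}
	}
	push := func(o op) {
		used[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchCap {
			reset()
		}
	}

	for _, o := range ops {
		if used[o.bucketID] {
			queue = append(queue, o) // bucket already in this batch: defer
			continue
		}
		push(o)
	}
	for len(queue) > 0 { // drain deferred ops, flushing between passes
		pending := queue
		queue = nil
		for _, o := range pending {
			if used[o.bucketID] {
				queue = append(queue, o)
				continue
			}
			push(o)
		}
		if len(batch) > 0 {
			reset()
		}
	}
	if len(batch) > 0 {
		reset() // flush the final partial batch
	}
}

func main() {
	schedule([]op{{1}, {2}, {1}, {1}, {3}}, func(b []op) {
		fmt.Println("flush:", b)
	})
}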
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
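Once the shared inversion has produced 1/(x2-x1) for every pair, each addition in the batch reduces to the plain affine chord formulas. A hypothetical single-pair helper over the bls12-381 base field (not part of the package) showing the per-point work that batchAddG1Affine performs:

package sketch

import "github.com/consensys/gnark-crypto/ecc/bls12-381/fp"

// affineAdd computes (x3, y3) = (x1, y1) + (x2, y2) given inv = 1/(x2-x1)
// from a batch inversion. Doubling and infinity are assumed to have been
// filtered out beforehand, as in the generated code.
func affineAdd(x1, y1, x2, y2, inv fp.Element) (x3, y3 fp.Element) {
	var lambda, d fp.Element
	d.Sub(&y2, &y1)
	lambda.Mul(&inv, &d) // lambda = (y2-y1)/(x2-x1)
	x3.Square(&lambda)   // x3 = lambda^2 - x1 - x2
	x3.Sub(&x3, &x1)
	x3.Sub(&x3, &x2)
	d.Sub(&x1, &x3)
	y3.Mul(&lambda, &d) // y3 = lambda*(x1-x3) - y1
	y3.Sub(&y3, &y1)
	return x3, y3
}

Per pair this is two multiplications and a squaring, on top of roughly three multiplications per element from the two batch-inversion passes, so a batch of n additions costs about one inversion plus 5n to 6n field multiplications instead of n inversions.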
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E2 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E2 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E2 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E2 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E2 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E2 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E2 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E2 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index 25faa396cf..bde8a50d43 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -985,20 +985,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1006,36 +1017,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 32601c0b08..662bfe0313 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -995,20 +995,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fptower.E4 + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fptower.E4 var rr G2Affine @@ -1016,36 +1027,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, 
&P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E4, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E4) { - - var accumulator fptower.E4 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 207a4c7f23..ebaf6a86f7 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, 
digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 9ec9c35382..800c106b7d 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -16,15 +16,18 @@ package bls24315 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" + "github.com/consensys/gnark-crypto/ecc/bls24-315/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of 
the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
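The accumulator loops inlined into batchAddG1Affine and batchAddG2Affine in the earlier hunks are Montgomery's batch-inversion trick: one field inversion plus roughly 3(n-1) multiplications replaces n inversions. A self-contained sketch over math/big with a toy prime (illustrative only; the library's fp.Element and fptower APIs differ):

package main

import (
	"fmt"
	"math/big"
)

// batchInvert computes the modular inverses of a[0..n) with a single
// ModInverse call, using prefix products (Montgomery's trick).
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	n := len(a)
	res := make([]*big.Int, n)
	acc := big.NewInt(1)
	for i := 0; i < n; i++ {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1]
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	acc.ModInverse(acc, p) // acc = (a[0]*...*a[n-1])^-1
	for i := n - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc)
		res[i].Mod(res[i], p) // res[i] = a[i]^-1
		acc.Mul(acc, a[i])
		acc.Mod(acc, p) // acc = (a[0]*...*a[i-1])^-1
	}
	return res
}

func main() {
	p := big.NewInt(97)
	a := []*big.Int{big.NewInt(3), big.NewInt(10), big.NewInt(42)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(a[i], "^-1 mod 97 =", inv, "check:", check.Mod(check, p)) // check is always 1
	}
}

This is why the batched variant pays a single inversion regardless of the batch size: the inversion cost is amortized across every addition in the batch.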
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
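For reference, the affine addition each batched lane performs is the chord rule: lambda = (y2 - y1) / (x2 - x1), x3 = lambda^2 - x1 - x2, y3 = lambda*(x1 - x3) - y1; the batch exists purely to share the cost of those divisions. A toy check on the textbook curve y^2 = x^3 + 2x + 2 over F_17 (hypothetical helper names, not library code):

package main

import "fmt"

const p = 17 // toy field; the real code works over the curve's fp / fptower elements

func mod(x int) int { return ((x % p) + p) % p }

// inv computes x^(p-2) mod p by repeated multiplication (fine for a 17-element field)
func inv(x int) int {
	r := 1
	for i := 0; i < p-2; i++ {
		r = mod(r * x)
	}
	return r
}

// affineAdd returns P+Q for distinct P, Q with Px != Qx.
func affineAdd(x1, y1, x2, y2 int) (int, int) {
	lambda := mod((y2 - y1) * inv(mod(x2-x1))) // the inversion the batch amortizes
	x3 := mod(lambda*lambda - x1 - x2)
	y3 := mod(lambda*(x1-x3) - y1)
	return x3, y3
}

func main() {
	// P = (5,1), 2P = (6,3) on this curve; P + 2P = 3P = (10,6).
	x3, y3 := affineAdd(5, 1, 6, 3)
	fmt.Println(x3, y3) // 10 6
}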
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
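The fixed array lengths declared for each window (80 entries at c=10 up to 640 at c=16) replace the old runtime choice batchSize := len(buckets) / 20 capped at MAX_BATCH_SIZE; the new sizes are hand-tuned constants so the batch arrays can be concrete types. A small helper, assumed here only for illustration, makes the tuning visible:

package main

import "fmt"

// batchSize mirrors the hand-tuned array lengths declared above
// (an illustrative helper, not part of the patch).
func batchSize(c int) int {
	sizes := map[int]int{10: 80, 11: 150, 12: 200, 13: 350, 14: 400, 15: 500, 16: 640}
	return sizes[c]
}

func main() {
	for c := 10; c <= 16; c++ {
		nbBuckets := 1 << (c - 1)
		fmt.Printf("c=%d: %d buckets, batch of %d (%.1f%% of buckets)\n",
			c, nbBuckets, batchSize(c), 100*float64(batchSize(c))/float64(nbBuckets))
	}
}

Keeping the batch small relative to the bucket count keeps same-bucket collisions, and therefore queue traffic, rare.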
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E4 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E4 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E4 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E4 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E4 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E4 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E4 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E4 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index f1e3773049..cd9452b1ce 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -985,20 +985,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1006,36 +1017,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 0f1693cdc8..96d823eaf9 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -995,20 +995,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fptower.E4 +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fptower.E4 + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fptower.E4 var rr G2Affine @@ -1016,36 +1027,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, 
&P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E4, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E4) { - - var accumulator fptower.E4 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index b9baa2cec7..c05f920246 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, 
digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index e27eb9efeb..f1fb40dea1 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -16,15 +16,18 @@ package bls24317 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" + "github.com/consensys/gnark-crypto/ecc/bls24-317/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of 
the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
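Digit decoding in the main loop packs sign and bucket index into one value: the low bit set means subtract (from the signed-digit recoding), and for additions the bucket is (digit >> 1) - 1, since digit 0 means "skip". A hypothetical decoder mirroring those few lines:

package main

import "fmt"

// decode mirrors the digit handling in the main loop (illustrative only).
// digit 0 never reaches this point; the loop skips it.
func decode(digit uint16) (bucketID uint16, isAdd bool) {
	isAdd = digit&1 == 0
	bucketID = digit >> 1
	if isAdd {
		bucketID-- // digit d encodes bucket d-1 for additions
	}
	return
}

func main() {
	for _, d := range []uint16{2, 3, 8, 9} {
		b, add := decode(d)
		fmt.Printf("digit=%d -> bucket %d, isAdd=%v\n", d, b, add)
	}
}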
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
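The special cases that add and addFromQueue peel off are exactly the inputs the batched formula cannot handle: whenever the bucket and the incoming point share an x coordinate, the shared denominator x_P - x_BK is zero, which means either a doubling (tangent formula needed) or P + (-P) = infinity. A condensed sketch of that guard with toy integer coordinates (names are illustrative):

package main

import "fmt"

type point struct{ x, y int }

// safeForBatch reports whether bk + p can go through the batched chord
// addition, i.e. whether the denominator p.x - bk.x is nonzero.
// Toy int coordinates stand in for field elements; -1 plays the role
// of p-1 in a real field.
func safeForBatch(bk, p point) (ok bool, why string) {
	if bk.x != p.x {
		return true, "distinct x: chord formula applies"
	}
	if bk.y == p.y {
		return false, "bk == p: doubling, tangent formula needed"
	}
	return false, "bk == -p: sum is the point at infinity"
}

func main() {
	bk := point{5, 1}
	for _, p := range []point{{6, 3}, {5, 1}, {5, -1}} {
		ok, why := safeForBatch(bk, p)
		fmt.Println(p, ok, why)
	}
}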
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
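A note on the queue entries used above: the old batchOp stored a uint32 pointID with the sign folded into its low bit, so a queued op cost a second indexed load and a conditional negation when finally scheduled; batchOpG1Affine and batchOpG2Affine instead store the already-negated point by value, trading queue memory for fewer indirections. A toy size comparison under assumed 6-word coordinates (real widths vary per curve):

package main

import (
	"fmt"
	"unsafe"
)

// Toy stand-ins: a real fp.Element is some fixed number of uint64 words.
type fpElement [6]uint64
type g1Affine struct{ X, Y fpElement }

// old-style queue entry: an index into the points slice plus a sign bit
type opByIndex struct {
	pointID  uint32
	bucketID uint16
}

// new-style queue entry: the (already negated, if needed) point by value
type opByValue struct {
	bucketID uint16
	point    g1Affine
}

func main() {
	// typically prints 8 104 on 64-bit targets
	fmt.Println(unsafe.Sizeof(opByIndex{}), unsafe.Sizeof(opByValue{}))
}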
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E4 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E4 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E4 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E4 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E4 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E4 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E4 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E4 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 75a3e25983..5bad4b316c 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -955,20 +955,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R []*G1Affine, P 
[]G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -976,36 +987,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 6437e4542e..09011b0c53 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -984,20 +984,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fptower.E2 +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fptower.E2 + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fptower.E2 var rr G2Affine @@ -1005,36 +1016,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + 
rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfptower.E2, ignores edge cases -func batchInvertG2Affine(res, a []fptower.E2) { - - var accumulator fptower.E2 - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 75e8a96061..6989e58d4b 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) 
default: panic("not implemented") @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 1f6ba85280..8eeded8aa1 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -16,15 +16,18 @@ package bn254 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bn254/fp" + "github.com/consensys/gnark-crypto/ecc/bn254/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // 
this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() + } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
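[editor's note] The block inlined into batchAddG1Affine/batchAddG2Affine above is the classic Montgomery batch-inversion (prefix-product) trick: n field inversions become one inversion plus about 3n multiplications. A self-contained sketch over math/big with a small prime, for illustration only — the real code works in place on fp.Element (resp. fptower.E2) arrays and, like the original, ignores zero inputs.

package main

import (
	"fmt"
	"math/big"
)

// batchInvert computes res[i] = a[i]^-1 mod p with a single modular
// inversion, using the same two-pass prefix-product technique as the
// hunk above. Edge cases (a[i] == 0) are ignored, as in the original.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)

	// forward pass: res[i] = a[0]*...*a[i-1]
	for i := range a {
		res[i] = new(big.Int).Set(acc)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}

	// one inversion of the full product
	acc.ModInverse(acc, p)

	// backward pass: res[i] = (a[0]*...*a[i-1]) * (a[0]*...*a[n-1])^-1
	// with the tail (a[i+1]*...*a[n-1]) folded back in, i.e. a[i]^-1
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc)
		res[i].Mod(res[i], p)
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	a := []*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(50)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(a[i], "*", inv, "=", check.Mod(check, p)) // always 1
	}
}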
@@ -222,12 +246,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC6 [1 << (6 - 1)]G1Affine -type bucketG1AffineC7 [1 << (7 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine -type bucketG1AffineC9 [1 << (9 - 1)]G1Affine type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC12 [1 << (12 - 1)]G1Affine @@ -236,14 +254,9 @@ type bucketG1AffineC14 [1 << (14 - 1)]G1Affine type bucketG1AffineC15 [1 << (15 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC6 | - bucketG1AffineC7 | - bucketG1AffineC8 | - bucketG1AffineC9 | - bucketG1AffineC10 | + bucketG1AffineC10 | bucketG1AffineC11 | bucketG1AffineC12 | bucketG1AffineC13 | @@ -252,13 +265,95 @@ type ibG1Affine interface { bucketG1AffineC16 } +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC10 | + cG1AffineC11 | + cG1AffineC12 | + cG1AffineC13 | + cG1AffineC14 | + cG1AffineC15 | + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC10 | + pG1AffineC11 | + pG1AffineC12 | + pG1AffineC13 | + pG1AffineC14 | + pG1AffineC15 | + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC10 | + ppG1AffineC11 | + ppG1AffineC12 | + ppG1AffineC13 | + ppG1AffineC14 | + ppG1AffineC15 | + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC10 | + qOpsG1AffineC11 | + qOpsG1AffineC12 | + qOpsG1AffineC13 | + qOpsG1AffineC14 | + qOpsG1AffineC15 | + qOpsG1AffineC16 +} +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qOpsG1AffineC10 [80]batchOpG1Affine +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qOpsG1AffineC11 [150]batchOpG1Affine +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qOpsG1AffineC12 [200]batchOpG1Affine +type cG1AffineC13 [350]fp.Element +type pG1AffineC13 [350]G1Affine +type ppG1AffineC13 [350]*G1Affine +type qOpsG1AffineC13 [350]batchOpG1Affine +type cG1AffineC14 [400]fp.Element +type pG1AffineC14 [400]G1Affine +type ppG1AffineC14 [400]*G1Affine +type qOpsG1AffineC14 [400]batchOpG1Affine +type cG1AffineC15 [500]fp.Element +type pG1AffineC15 [500]G1Affine +type ppG1AffineC15 [500]*G1Affine +type qOpsG1AffineC15 [500]batchOpG1Affine +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 +} + // processChunkG2BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. 
// // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -272,22 +367,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -301,76 +387,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -379,7 +487,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -389,40 +497,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
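[editor's note] Each scalar digit in the loops above is consumed the same way: a zero digit is skipped, the low bit says whether to add or subtract, and the remaining bits select the bucket. Subtraction is realized by negating the point before it enters the batch, which is what lets 2^(c-1) buckets cover a signed window of c bits. A small sketch of just that decoding, assuming the encoding the loop implies; the helper name is hypothetical.

package main

import "fmt"

// decode mirrors the digit handling above: zero is a no-op, the low
// bit selects add vs. subtract, the remaining bits locate the bucket
// (bucket b accumulates multiples of b+1 under this encoding).
func decode(digit uint16) (bucketID uint16, isAdd bool, skip bool) {
	if digit == 0 {
		return 0, false, true
	}
	isAdd = digit&1 == 0
	bucketID = digit >> 1
	if isAdd {
		bucketID-- // even digit 2m means "add m", stored in bucket m-1
	}
	return bucketID, isAdd, false
}

func main() {
	for _, d := range []uint16{0, 2, 3, 4, 5} {
		b, add, skip := decode(d)
		fmt.Printf("digit=%d skip=%v bucket=%d add=%v\n", d, skip, b, add)
	}
}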
@@ -447,12 +562,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC6 [1 << (6 - 1)]G2Affine -type bucketG2AffineC7 [1 << (7 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine -type bucketG2AffineC9 [1 << (9 - 1)]G2Affine type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC12 [1 << (12 - 1)]G2Affine @@ -461,14 +570,9 @@ type bucketG2AffineC14 [1 << (14 - 1)]G2Affine type bucketG2AffineC15 [1 << (15 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC6 | - bucketG2AffineC7 | - bucketG2AffineC8 | - bucketG2AffineC9 | - bucketG2AffineC10 | + bucketG2AffineC10 | bucketG2AffineC11 | bucketG2AffineC12 | bucketG2AffineC13 | @@ -477,6 +581,78 @@ type ibG2Affine interface { bucketG2AffineC16 } +// array of coordinates fptower.E2 +type cG2Affine interface { + cG2AffineC10 | + cG2AffineC11 | + cG2AffineC12 | + cG2AffineC13 | + cG2AffineC14 | + cG2AffineC15 | + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC10 | + pG2AffineC11 | + pG2AffineC12 | + pG2AffineC13 | + pG2AffineC14 | + pG2AffineC15 | + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC10 | + ppG2AffineC11 | + ppG2AffineC12 | + ppG2AffineC13 | + ppG2AffineC14 | + ppG2AffineC15 | + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC10 | + qOpsG2AffineC11 | + qOpsG2AffineC12 | + qOpsG2AffineC13 | + qOpsG2AffineC14 | + qOpsG2AffineC15 | + qOpsG2AffineC16 +} +type cG2AffineC10 [80]fptower.E2 +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qOpsG2AffineC10 [80]batchOpG2Affine +type cG2AffineC11 [150]fptower.E2 +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qOpsG2AffineC11 [150]batchOpG2Affine +type cG2AffineC12 [200]fptower.E2 +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qOpsG2AffineC12 [200]batchOpG2Affine +type cG2AffineC13 [350]fptower.E2 +type pG2AffineC13 [350]G2Affine +type ppG2AffineC13 [350]*G2Affine +type qOpsG2AffineC13 [350]batchOpG2Affine +type cG2AffineC14 [400]fptower.E2 +type pG2AffineC14 [400]G2Affine +type ppG2AffineC14 [400]*G2Affine +type qOpsG2AffineC14 [400]batchOpG2Affine +type cG2AffineC15 [500]fptower.E2 +type pG2AffineC15 [500]G2Affine +type ppG2AffineC15 [500]*G2Affine +type qOpsG2AffineC15 [500]batchOpG2Affine +type cG2AffineC16 [640]fptower.E2 +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine + type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index 707fbefb6f..dc2289ac76 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1087,20 +1087,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R 
[]*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1108,36 +1119,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index 69e4b4263c..3d27026424 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -950,20 +950,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G2Affine @@ -971,36 +982,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - 
d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG2Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index fb6367cb6b..d329dacf85 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -161,7 +161,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -372,7 +372,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 38c1973d0e..fd164fefd2 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -16,15 +16,18 @@ package bw6633 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-633/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. 
- batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) return } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . 
+ if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
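[editor's note] Written out, the batched loop evaluates the textbook affine chord formulas lambda = (y2-y1)/(x2-x1), x3 = lambda^2 - x1 - x2, y3 = lambda*(x1-x3) - y1, with the division replaced by a multiplication against the batch-inverted denominator. A worked sketch over math/big on the toy curve y^2 = x^3 + 7 mod 17, illustrative only; the real code operates on fp.Element coordinates.

package main

import (
	"fmt"
	"math/big"
)

var p = big.NewInt(17) // toy prime; the real code works over the curve's fp

// affineAdd applies the same formulas as the batch-add loop above. In
// the batched version, the single ModInverse below is exactly what the
// shared batch inversion amortizes away.
func affineAdd(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
	lambda := new(big.Int).Sub(y2, y1)
	den := new(big.Int).Sub(x2, x1)
	den.ModInverse(den, p)
	lambda.Mul(lambda, den)
	lambda.Mod(lambda, p)

	x3 := new(big.Int).Mul(lambda, lambda)
	x3.Sub(x3, x1)
	x3.Sub(x3, x2)
	x3.Mod(x3, p)

	y3 := new(big.Int).Sub(x1, x3)
	y3.Mul(y3, lambda)
	y3.Sub(y3, y1)
	y3.Mod(y3, p)
	return x3, y3
}

func main() {
	// (1,5) and (5,8) both lie on y^2 = x^3 + 7 over F_17
	x3, y3 := affineAdd(big.NewInt(1), big.NewInt(5), big.NewInt(5), big.NewInt(8))
	fmt.Println(x3, y3) // 2 7, also on the curve
}

After the shared inversion, each iteration of the loop costs one multiplication to finish lambda, one squaring, and one more multiplication — matching the Mul/Square calls visible per point above.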
@@ -222,16 +246,44 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC8 | - bucketG1AffineC16 + bucketG1AffineC16 +} + +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC16 +} +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -240,7 +292,8 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -254,22 +307,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -283,76 +327,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. 
+ // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -361,7 +427,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -371,40 +437,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
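[editor's note] The new pG*/ppG*/qOps*/cG* type families replace the former MAX_BATCH_SIZE arrays: each window size c gets its own fixed-length array types, joined in a union interface, so every instantiation of the generic chunk processor knows its buffer lengths at compile time and the buffers can stay on the stack instead of escaping to the heap. A toy mirror of the pattern (names illustrative):

package main

import "fmt"

// Fixed-size array types joined in a type-set interface, as the patch
// does per window size c.
type bufC4 [1 << (4 - 1)]int
type bufC5 [1 << (5 - 1)]int

type ibuf interface {
	bufC4 | bufC5
}

// fill mirrors how processChunkG1BatchAffine uses its TP/TPP/TQ/TC
// parameters: declare the array locally, take len of the value, and
// index through a pointer, exactly like batchSize := len(P) and
// (*P)[j] in the hunks above.
func fill[B ibuf](buf *B) int {
	n := len(*buf)
	for i := 0; i < n; i++ {
		(*buf)[i] = i
	}
	return n
}

func main() {
	var small bufC4 // stack-allocated; length known at compile time
	var large bufC5
	fmt.Println(fill(&small), fill(&large)) // 8 16
}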
@@ -429,17 +502,36 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC8 | - bucketG2AffineC16 + bucketG2AffineC16 +} + +// array of coordinates fp.Element +type cG2Affine interface { + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC16 } +type cG2AffineC16 [640]fp.Element +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index 395f0a7f84..5cbf001665 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1087,20 +1087,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1108,36 +1119,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git 
a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index 63a4631e6d..e8b048fb9b 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -944,20 +944,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G2Affine @@ -965,36 +976,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG2Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 0ba6a6ed57..719bca28bf 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -162,7 +162,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -374,7 +374,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, 
processChunk) default: panic("not implemented") diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index c59b38e882..a9f8f12db5 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -16,15 +16,18 @@ package bw6756 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-756/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) return } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? 
+ BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
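[editor's note] Because buckets live in plain affine coordinates, the add closure above must special-case everything that complete Jacobian formulas would absorb: an empty bucket, P + P, and P - P, the latter two detected via equal x-coordinates. A schematic mirror of that case analysis with the field arithmetic stubbed out, illustrative only:

package main

import "fmt"

type point struct {
	x, y int
	inf  bool
}

// bucketUpdate enumerates the same cases as the add closure: only the
// last case is deferred to the batched addition; the others mutate the
// bucket immediately.
func bucketUpdate(bk, p point, isAdd bool) string {
	switch {
	case bk.inf: // empty bucket
		if isAdd {
			return "set BK = P"
		}
		return "set BK = -P"
	case bk.x == p.x && bk.y == p.y: // same point
		if isAdd {
			return "double BK (P + P)"
		}
		return "set BK to infinity (P - P)"
	case bk.x == p.x: // same x, opposite y: BK == -P
		if isAdd {
			return "set BK to infinity (-P + P)"
		}
		return "double BK (-P - P)"
	default:
		return "queue (BK, +/-P) for the batched addition"
	}
}

func main() {
	bk := point{x: 1, y: 5}
	fmt.Println(bucketUpdate(bk, point{x: 1, y: 5}, true))  // doubling
	fmt.Println(bucketUpdate(bk, point{x: 1, y: 5}, false)) // cancellation
	fmt.Println(bucketUpdate(bk, point{x: 2, y: 7}, true))  // generic: batched
}

Only the generic case benefits from the batch; doubling, flagged TODO above, currently falls back to the full Add rather than a dedicated affine double.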
@@ -222,16 +246,44 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC8 | - bucketG1AffineC16 + bucketG1AffineC16 +} + +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC16 +} +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -240,7 +292,8 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -254,22 +307,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -283,76 +327,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. 
+ // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -361,7 +427,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -371,40 +437,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
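
The "montgomery batch invert technique" that batchAddG1Affine and batchAddG2Affine lean on is worth seeing in isolation: n modular inverses for the price of a single inversion plus roughly 3n multiplications. Here is a minimal, self-contained illustration using math/big instead of the package's field types (nothing below is gnark-crypto API); the forward/backward accumulator loops mirror the ones inlined in the hunks nearby.

package main

import (
	"fmt"
	"math/big"
)

// batchInvert inverts every a[i] modulo p with a single ModInverse call.
// Like the library routines it illustrates, it ignores edge cases: each
// a[i] is assumed nonzero mod p.
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1]
		acc.Mul(acc, a[i]).Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the one and only inversion
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc).Mod(res[i], p) // res[i] = 1/a[i]
		acc.Mul(acc, a[i]).Mod(acc, p)         // acc = 1/(a[0]*...*a[i-1])
	}
	return res
}

func main() {
	p := big.NewInt(10007) // toy prime standing in for the fp modulus
	a := []*big.Int{big.NewInt(3), big.NewInt(42), big.NewInt(9999)}
	for i, inv := range batchInvert(a, p) {
		check := new(big.Int).Mul(a[i], inv)
		fmt.Println(check.Mod(check, p)) // 1, three times
	}
}

This is why a larger batch is cheaper per point: the cost of the single Inverse is spread across every addition in the batch.
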
@@ -429,17 +502,36 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC8 | - bucketG2AffineC16 + bucketG2AffineC16 +} + +// array of coordinates fp.Element +type cG2Affine interface { + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC16 } +type cG2AffineC16 [640]fp.Element +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index 880371b042..d6de060519 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1098,20 +1098,31 @@ func BatchScalarMultiplicationG1(base *G1Affine, scalars []fr.Element) []G1Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG1Affine(R []*G1Affine, P []G1Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG1Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G1Affine @@ -1119,36 +1130,16 @@ func batchAddG1Affine(R []*G1Affine, P []G1Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG1Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git 
a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index 892cedad40..b1b8b664dd 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -958,20 +958,31 @@ func BatchScalarMultiplicationG2(base *G2Affine, scalars []fr.Element) []G2Affin // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAddG2Affine(R []*G2Affine, P []G2Affine) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]fp.Element +func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvertG2Affine(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator fp.Element + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d fp.Element var rr G2Affine @@ -979,36 +990,16 @@ func batchAddG2Affine(R []*G2Affine, P []G2Affine) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) - } -} - -// batch inversion -// similar to BatchInvertfp.Element, ignores edge cases -func batchInvertG2Affine(res, a []fp.Element) { - - var accumulator fp.Element - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index f8165ca221..1fce2c8080 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -162,7 +162,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -374,7 +374,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, 
processChunk) default: panic("not implemented") diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 83b2c11fbe..638f888c2a 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -16,15 +16,18 @@ package bw6761 -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" + "github.com/consensys/gnark-crypto/ecc/bw6-761/internal/fptower" +) -type batchOp struct { - pointID uint32 +type batchOpG1Affine struct { bucketID uint16 + point G1Affine } -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 +func (o batchOpG1Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -33,7 +36,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, +func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( + chunk uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, @@ -47,22 +51,13 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G1Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G1Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -76,76 +71,98 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG1Affine(R[:cptAdd], P[:cptAdd]) + batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG1Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) return } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? 
+ BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G1Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -154,7 +171,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -164,40 +181,47 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
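
For reference, the arithmetic in the batchAddG1Affine / batchAddG2Affine hunks above is the textbook affine chord addition R = R + P, with the division replaced by the batch-inverted denominator:

    lambda = (P.Y - R.Y) / (P.X - R.X)   // lambdain collects denominators; lambda receives their inverses
    X3     = lambda^2 - R.X - P.X
    Y3     = lambda*(R.X - X3) - R.Y

which is exactly the rr.X / rr.Y sequence written out. The tangent case P == R (slope 3*X^2 / (2*Y) on these a = 0 short Weierstrass curves) and the point at infinity never reach this code: as the comment on these functions says, doubling and infinity must be filtered out by the caller, which is what the special-case branches in addFromQueue and add do.
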
@@ -222,16 +246,44 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC4 [1 << (4 - 1)]G1Affine -type bucketG1AffineC5 [1 << (5 - 1)]G1Affine -type bucketG1AffineC8 [1 << (8 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +// buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC4 | - bucketG1AffineC5 | - bucketG1AffineC8 | - bucketG1AffineC16 + bucketG1AffineC16 +} + +// array of coordinates fp.Element +type cG1Affine interface { + cG1AffineC16 +} + +// buckets: array of G1Affine points (for the batch addition) +type pG1Affine interface { + pG1AffineC16 +} + +// buckets: array of *G1Affine points (for the batch addition) +type ppG1Affine interface { + ppG1AffineC16 +} + +// buckets: array of G1Affine queue operations (for the batch addition) +type qOpsG1Affine interface { + qOpsG1AffineC16 +} +type cG1AffineC16 [640]fp.Element +type pG1AffineC16 [640]G1Affine +type ppG1AffineC16 [640]*G1Affine +type qOpsG1AffineC16 [640]batchOpG1Affine + +type batchOpG2Affine struct { + bucketID uint16 + point G2Affine +} + +func (o batchOpG2Affine) isNeg() bool { + return o.bucketID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -240,7 +292,8 @@ type ibG1Affine interface { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, +func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( + chunk uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, @@ -254,22 +307,13 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*G2Affine + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]G2Affine + batchSize := len(P) canAdd := func(bID uint16) bool { return !bucketIds[bID] @@ -283,76 +327,98 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if (cptAdd) == 0 { return } - batchAddG2Affine(R[:cptAdd], P[:cptAdd]) + batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOpG2Affine) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. 
+ // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() return } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *G2Affine, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? + // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func() { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -361,7 +427,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -371,40 +437,47 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit >> 1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit >> 1) - 1) - } else { - // sub - op.bucketID = (uint16((digit >> 1))) - op.pointID += 1 + bucketID -= 1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE-1 { + + // queue is full, flush it. + if qID == len(queue)-1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
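
The digit handling at the top of the main loops above packs a signed window into an unsigned value whose low bit is the sign. A standalone sketch of the decoding (the decode helper is illustrative, not part of the package): an even digit d means "add to bucket d/2 - 1", an odd digit means "subtract from bucket d>>1", and 0 means the window contributes nothing -- callers skip it, as the loops above do.

package main

import "fmt"

// decode undoes the packing done by partitionScalars: the low bit of a
// digit carries the sign of the window. Callers must skip digit == 0.
func decode(digit uint16) (bucketID uint16, isAdd bool) {
	isAdd = digit&1 == 0
	bucketID = digit >> 1
	if isAdd {
		bucketID-- // even digits are shifted by one so that 0 can mean "skip"
	}
	return
}

func main() {
	for _, d := range []uint16{2, 3, 8, 9} {
		b, add := decode(d)
		fmt.Printf("digit=%d -> bucket=%d isAdd=%v\n", d, b, add)
	}
	// digit=2 -> bucket=0 isAdd=true
	// digit=3 -> bucket=1 isAdd=false
	// digit=8 -> bucket=3 isAdd=true
	// digit=9 -> bucket=4 isAdd=false
}

Bucket k accumulates the multiple (k+1)*P, so decoding never yields a zero-weight bucket and digit 0 stays reserved for "skip".
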
@@ -429,17 +502,36 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet](chunk uint64, // we declare the buckets as fixed-size array types // this allows us to allocate the buckets on the stack -type bucketG2AffineC4 [1 << (4 - 1)]G2Affine -type bucketG2AffineC5 [1 << (5 - 1)]G2Affine -type bucketG2AffineC8 [1 << (8 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +// buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC4 | - bucketG2AffineC5 | - bucketG2AffineC8 | - bucketG2AffineC16 + bucketG2AffineC16 +} + +// array of coordinates fp.Element +type cG2Affine interface { + cG2AffineC16 +} + +// buckets: array of G2Affine points (for the batch addition) +type pG2Affine interface { + pG2AffineC16 +} + +// buckets: array of *G2Affine points (for the batch addition) +type ppG2Affine interface { + ppG2AffineC16 +} + +// buckets: array of G2Affine queue operations (for the batch addition) +type qOpsG2Affine interface { + qOpsG2AffineC16 } +type cG2AffineC16 [640]fp.Element +type pG2AffineC16 [640]G2Affine +type ppG2AffineC16 [640]*G2Affine +type qOpsG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index a4b3e9b5fd..6eb2c9f975 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -36,7 +36,39 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er } return n - (c * (n / c)) } + batchSize := func(c int) int { + // nbBuckets := (1 << (c - 1)) + // if c <= 12 { + // return nbBuckets/10 + 3*c + // } + // if c <= 14 { + // return nbBuckets/15 + // } + // return nbBuckets / 20 + // TODO @gbotrel / @yelhousni this needs a better heuristic + // in theory, larger batch size == fewer inversions + // but if nbBuckets is small, then a large batch size will produce lots of collisions + // and queue ops. + // there is probably a cache-friendliness factor at play here too.
+ switch c { + case 10: + return 80 + case 11: + return 150 + case 12: + return 200 + case 13: + return 350 + case 14: + return 400 + case 15: + return 500 + default: + return 640 + } + } funcs["lastC"] = lastC + funcs["batchSize"] = batchSize funcs["contains"] = func(v int, s []int) bool { for _, sv := range s { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 4655925e5e..c5b4eb675b 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -429,7 +429,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $c 9}} processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}] + processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, qOps{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} {{- if eq $c $lc}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk) @@ -437,7 +437,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} - processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}] + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, qOps{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index cf1c0ce71f..d19c51dca7 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -7,26 +7,31 @@ {{ $G2TJacobianExtended := print (toLower .G2.PointName) "JacExtended" }} -const MAX_BATCH_SIZE = 600 +import ( + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" +) + -type batchOp struct { - pointID uint32 - bucketID uint16 -} -func (o batchOp) isNeg() bool { - return o.pointID&1 == 1 -} -{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} +{{ template "multiexp" dict "CoordType" .G1.CoordType "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} +{{ template "multiexp" dict "CoordType" .G2.CoordType "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian 
"TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} {{define "multiexp" }} +type batchOp{{ $.TAffine }} struct { + bucketID uint16 + point {{ $.TAffine }} +} + +func (o batchOp{{ $.TAffine }}) isNeg() bool { + return o.bucketID&1 == 1 +} // processChunk{{ $.UPointName }}BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition @@ -34,7 +39,8 @@ func (o batchOp) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](chunk uint64, +func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, TP p{{ $.TAffine }}, TPP pp{{ $.TAffine }}, TQ qOps{{ $.TAffine }}, TC c{{ $.TAffine}}]( + chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, @@ -48,24 +54,16 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c // setup for the batch affine; // we do that instead of a separate object to give enough hints to the compiler to.. - // keep things on the stack. - batchSize := len(buckets) / 20 - if batchSize > MAX_BATCH_SIZE { - batchSize = MAX_BATCH_SIZE - } - if batchSize <= 0 { - batchSize = 1 - } var bucketIds BS // bitSet to signify presence of a bucket in current batch cptAdd := 0 // count the number of bucket + point added to current batch - // bucket references - var R [MAX_BATCH_SIZE]*{{ $.TAffine }} - - // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) - var P [MAX_BATCH_SIZE]{{ $.TAffine }} - - + + var R TPP // bucket references + var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + + batchSize := len(P) + + canAdd := func(bID uint16) bool { return !bucketIds[bID] } @@ -78,78 +76,101 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c if (cptAdd) == 0 { return } - batchAdd{{ $.TAffine }}(R[:cptAdd], P[:cptAdd]) + batchAdd{{ $.TAffine }}[TP, TPP, TC](&R, &P, cptAdd) var tmp BS bucketIds = tmp cptAdd = 0 } - add := func(op batchOp) { + addFromQueue := func(op batchOp{{$.TAffine}}) { // CanAdd must be called before --> ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] - PP := &points[op.pointID>>1] - if PP.IsInfinity() { + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) return } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } + + add := func(bucketID uint16, PP *{{$.TAffine}}, isAdd bool) { + // CanAdd must be called before --> ensures bucket is not "used" in current batch + + BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { - if op.isNeg() { - BK.Neg(PP) - } else { + if isAdd { BK.Set(PP) + } else { + BK.Neg(PP) } return } if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { - if op.isNeg() { - // P + -P - BK.setInfinity() - return - } - // P + P: doubling, which should be quite rare -- may want to put it back in the batch add? 
+ // P + P: doubling, which should be quite rare -- // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. // need doubling in affine implemented ? - BK.Add(BK, BK) + if isAdd { + BK.Add(BK, BK) + } else { + BK.setInfinity() + } + return } - // b.Y == -p.Y - if op.isNeg() { - // doubling . + if isAdd { + BK.setInfinity() + } else { BK.Add(BK, BK) - return } - BK.setInfinity() return } - bucketIds[op.bucketID] = true + bucketIds[bucketID] = true R[cptAdd] = BK - if op.isNeg() { - P[cptAdd].Neg(PP) - } else { + if isAdd { P[cptAdd].Set(PP) + } else { + P[cptAdd].Neg(PP) } cptAdd++ } - var queue [MAX_BATCH_SIZE]batchOp + var queue TQ qID := 0 processQueue := func () { for i := qID - 1; i >= 0; i-- { - if canAdd(queue[i].bucketID) { - add(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] - qID-- + if !canAdd(queue[i].bucketID) { + continue + } + addFromQueue(queue[i]) + if isFull() { + executeAndReset() } + queue[i] = queue[qID-1] + qID-- } } @@ -158,7 +179,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c if !canAdd(queue[i].bucketID) { return } - add(queue[i]) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -168,40 +189,47 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c for i, digit := range digits { - if digit == 0 { + if digit == 0 || points[i].IsInfinity() { continue } - op := batchOp{pointID: uint32(i) << 1} - // if msbWindow bit is set, we need to substract - if digit&1 == 0 { + bucketID := uint16((digit>>1)) + isAdd := digit&1 == 0 + if isAdd { // add - op.bucketID = uint16((digit>>1) - 1) - } else { - // sub - op.bucketID = (uint16((digit>>1))) - op.pointID += 1 + bucketID-=1 } - if canAdd(op.bucketID) { - add(op) - if isFull() { - executeAndReset() - processTopQueue() + + if !canAdd(bucketID) { + // put it in queue + queue[qID].bucketID = bucketID + if isAdd { + queue[qID].point = points[i] + } else { + queue[qID].point.Neg(&points[i]) } - } else { - // put it in queue. - queue[qID] = op qID++ - if qID == MAX_BATCH_SIZE - 1 { + + // queue is full, flush it. + if qID == len(queue) - 1 { executeAndReset() processQueue() } + continue + } + + // we add the point to the batch. + add(bucketID, &points[i], isAdd) + if isFull() { + executeAndReset() + processTopQueue() } } + // empty the queue for qID != 0 { processQueue() - executeAndReset() // execute batch even if not full. + executeAndReset() } // flush items in batch. 
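
The wall of interface and array types that follows (and its per-curve instantiations above) exists for one reason: a fixed-size array can live on the stack, while a buffer sized at runtime generally escapes to the heap. So every window size c gets its own array types, and processChunk is compiled once per size through its type parameters. A toy sketch of the mechanism, with made-up names:

package main

import "fmt"

type bufC4 [1 << (4 - 1)]int
type bufC5 [1 << (5 - 1)]int

// ibuf mirrors the ib*/p*/pp*/q* constraints: a union of fixed-size arrays
// sharing an element type.
type ibuf interface {
	bufC4 | bufC5
}

// fill is instantiated once per concrete array type; `var b B` has a
// compile-time-known size in each instantiation, so it can stay on the stack.
func fill[B ibuf](v int) B {
	var b B
	for i := 0; i < len(b); i++ {
		b[i] = v
	}
	return b
}

func main() {
	fmt.Println(len(fill[bufC4](1)), len(fill[bufC5](2))) // 8 16
}

Note the loop uses len and plain indexing rather than range: when a type set mixes array lengths, those are the operations that remain available on a value of the type parameter, and it is the same pattern the generated code relies on (batchSize := len(P)).
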
@@ -227,15 +255,69 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet](c // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack {{- range $c := $.CRange}} +{{- if gt $c 9}} type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} {{- end}} +{{- end}} + +// buckets: array of {{ $.TAffine }} points of size 1 << (c-1) type ib{{ $.TAffine }} interface { {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} bucket{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} {{- end}} + {{- end}} +} + +// array of coordinates {{ $.CoordType }} +type c{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} + c{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} + {{- end}} +} + +// buckets: array of {{ $.TAffine }} points (for the batch addition) +type p{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} + p{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} + {{- end}} +} + +// buckets: array of *{{ $.TAffine }} points (for the batch addition) +type pp{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} + pp{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} + {{- end}} } +// buckets: array of {{ $.TAffine }} queue operations (for the batch addition) +type qOps{{ $.TAffine }} interface { + {{- range $i, $c := $.CRange}} + {{- if gt $c 9}} + qOps{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + {{- end}} + {{- end}} +} + + +{{- range $c := $.CRange}} +{{- if gt $c 9}} +type c{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]{{ $.CoordType }} +type p{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]{{ $.TAffine }} +type pp{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]*{{ $.TAffine }} +type qOps{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]batchOp{{ $.TAffine }} + +{{- end}} +{{- end}} + + {{end }} {{- range $c := $.G1.CRange}} diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index 6e30ad9de8..c5455ff072 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1574,21 +1574,32 @@ func BatchScalarMultiplication{{ toUpper .PointName }}(base *{{ $TAffine }}, sca // batch add affine coordinates // using batch inversion // special cases (doubling, infinity) must be filtered out before this call -func batchAdd{{ $TAffine }}(R []*{{ $TAffine }},P []{{ $TAffine }}) { - batchSize := len(R) - if batchSize == 0 { - return - } - var lambda, lambdain [MAX_BATCH_SIZE]{{.CoordType}} +func batchAdd{{ $TAffine }}[TP p{{ $TAffine }}, TPP pp{{ $TAffine }}, TC c{{ $TAffine }}](R *TPP,P *TP, batchSize int) { + var lambda, lambdain TC // add part for j := 0; j < batchSize; j++ { - lambdain[j].Sub(&P[j].X, &R[j].X) + lambdain[j].Sub(&(*P)[j].X, &(*R)[j].X) } - // invert denominator - batchInvert{{ $TAffine }}(lambda[:batchSize], lambdain[:batchSize]) + // invert denominator using montgomery batch invert technique + { + var accumulator {{.CoordType}} + accumulator.SetOne() + + for i := 0; i < batchSize; i++ { + lambda[i] = accumulator + accumulator.Mul(&accumulator, &lambdain[i]) + } + + accumulator.Inverse(&accumulator) + + for i := batchSize - 1; i >= 0; i-- { + lambda[i].Mul(&lambda[i], &accumulator) + accumulator.Mul(&accumulator, &lambdain[i]) + } + } var d {{.CoordType}} var rr {{ $TAffine }} @@ -1596,38 +1607,17 @@ func batchAdd{{ $TAffine }}(R []*{{ $TAffine }},P 
[]{{ $TAffine }}) { // add part for j := 0; j < batchSize; j++ { // computa lambda - d.Sub(&P[j].Y, &R[j].Y) + d.Sub(&(*P)[j].Y, &(*R)[j].Y) lambda[j].Mul(&lambda[j], &d) // compute X, Y rr.X.Square(&lambda[j]) - rr.X.Sub(&rr.X, &R[j].X) - rr.X.Sub(&rr.X, &P[j].X) - d.Sub(&R[j].X, &rr.X) + rr.X.Sub(&rr.X, &(*R)[j].X) + rr.X.Sub(&rr.X, &(*P)[j].X) + d.Sub(&(*R)[j].X, &rr.X) rr.Y.Mul(&lambda[j], &d) - rr.Y.Sub(&rr.Y, &R[j].Y) - R[j].Set(&rr) + rr.Y.Sub(&rr.Y, &(*R)[j].Y) + (*R)[j].Set(&rr) } } - - -// batch inversion -// similar to BatchInvert{{.CoordType}}, ignores edge cases -func batchInvert{{ $TAffine }}(res, a []{{.CoordType}}) { - - var accumulator {{.CoordType}} - accumulator.SetOne() - - for i := 0; i < len(res); i++ { - res[i] = accumulator - accumulator.Mul(&accumulator, &a[i]) - } - - accumulator.Inverse(&accumulator) - - for i := len(res) - 1; i >= 0; i-- { - res[i].Mul(&res[i], &accumulator) - accumulator.Mul(&accumulator, &a[i]) - } -} \ No newline at end of file From 2543ac331ad2d808ead1e960096fec15382217b2 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Mon, 14 Nov 2022 15:59:45 -0600 Subject: [PATCH 20/43] build: fix import in template --- ecc/bw6-633/multiexp_affine.go | 1 - ecc/bw6-756/multiexp_affine.go | 1 - ecc/bw6-761/multiexp_affine.go | 1 - internal/generator/ecc/template/multiexp_affine.go.tmpl | 4 +++- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index fd164fefd2..7d3323a044 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -18,7 +18,6 @@ package bw6633 import ( "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" - "github.com/consensys/gnark-crypto/ecc/bw6-633/internal/fptower" ) type batchOpG1Affine struct { diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index a9f8f12db5..739b3dca2e 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -18,7 +18,6 @@ package bw6756 import ( "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" - "github.com/consensys/gnark-crypto/ecc/bw6-756/internal/fptower" ) type batchOpG1Affine struct { diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 638f888c2a..d00ef26272 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -18,7 +18,6 @@ package bw6761 import ( "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" - "github.com/consensys/gnark-crypto/ecc/bw6-761/internal/fptower" ) type batchOpG1Affine struct { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index d19c51dca7..ccfba7f2be 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -8,8 +8,10 @@ import ( - "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" "github.com/consensys/gnark-crypto/ecc/{{.Name}}/fp" + {{- if ne .G1.CoordType .G2.CoordType}} + "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" + {{- end}} ) From 5733bd23d67584088db377f3d36e61cc1d105c9a Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 08:54:51 -0600 Subject: [PATCH 21/43] feat: use nbBits+1 instead of nbWords*64 for partitionScalars --- ecc/bls12-377/multiexp.go | 134 ++++++++--------- ecc/bls12-377/multiexp_affine.go | 84 +++++++---- ecc/bls12-377/multiexp_jacobian.go | 12 +- ecc/bls12-378/multiexp.go | 138 +++++++++--------- ecc/bls12-378/multiexp_affine.go | 84 
+++++++---- ecc/bls12-378/multiexp_jacobian.go | 12 +- ecc/bls12-381/multiexp.go | 76 +++++----- ecc/bls12-381/multiexp_affine.go | 84 +++++++---- ecc/bls24-315/multiexp.go | 134 ++++++++--------- ecc/bls24-315/multiexp_affine.go | 84 +++++++---- ecc/bls24-315/multiexp_jacobian.go | 12 +- ecc/bls24-317/multiexp.go | 76 +++++----- ecc/bls24-317/multiexp_affine.go | 84 +++++++---- ecc/bn254/multiexp.go | 138 +++++++++--------- ecc/bn254/multiexp_affine.go | 84 +++++++---- ecc/bn254/multiexp_jacobian.go | 12 +- ecc/bw6-633/multiexp.go | 70 ++++----- ecc/bw6-633/multiexp_affine.go | 12 +- ecc/bw6-633/multiexp_jacobian.go | 12 +- ecc/bw6-756/multiexp.go | 70 ++++----- ecc/bw6-756/multiexp_affine.go | 12 +- ecc/bw6-756/multiexp_jacobian.go | 12 +- ecc/bw6-761/multiexp.go | 74 +++++----- ecc/bw6-761/multiexp_affine.go | 12 +- ecc/bw6-761/multiexp_jacobian.go | 16 +- internal/generator/ecc/generate.go | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 38 ++--- .../ecc/template/multiexp_affine.go.tmpl | 8 +- 28 files changed, 884 insertions(+), 704 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 9f2a1998fc..4ec5027cfb 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,53 +153,56 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - 
processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, 
bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -208,8 +211,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +343,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +354,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -401,53 +404,56 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := 
processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, 
bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15]
-		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
+		processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 		_innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 16:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16]
-		_innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk)
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		_innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	default:
 		panic("not implemented")
 	}
@@ -456,8 +462,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config
 func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
 	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {

-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
+	nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar
+	if (fr.Bits+1)%c != 0 {
 		nbChunks++
 	}

@@ -543,8 +549,8 @@ type selector struct {
 // 0 < scalar < 2^c (in other words, scalars where only the c least-significant bits are non-zero)
 func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) {
 	// number of c-bit radixes in a scalar
-	nbChunks := fr.Limbs * 64 / c
-	if (fr.Limbs*64)%c != 0 {
+	nbChunks := (fr.Bits + 1) / c
+	if (fr.Bits+1)%c != 0 {
 		nbChunks++
 	}

@@ -618,11 +624,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 			digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
 		}

-		// if digit is zero, no impact on result
-		if digit == 0 {
-			continue
-		}
-
 		// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 		// 2^{c} from the current digit, making it negative.
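// ---------------------------------------------------------------------------
// Editorial aside (illustrative sketch, not part of this patch): the hunk
// above recodes each c-bit window of the scalar into a signed digit in
// [-2^{c-1}, 2^{c-1}-1] and packs it as (magnitude << 1) | signBit, so a zero
// digit can simply be skipped. A minimal standalone version of that encoding;
// recodeWindows and its signature are hypothetical, not gnark-crypto API.

package main

import "fmt"

// recodeWindows turns raw c-bit windows (least-significant window first,
// each in [0, 2^c)) into packed signed digits, borrowing 2^c from the next
// window whenever a digit would reach 2^{c-1}.
func recodeWindows(windows []int, c uint) []uint16 {
	max := 1 << (c - 1)
	out := make([]uint16, len(windows)+1) // one extra window for a final carry
	carry := 0
	for k, d := range windows {
		d += carry
		carry = 0
		if d >= max {
			d -= 1 << c // borrow 2^c from the next window
			carry = 1
		}
		switch {
		case d == 0: // zero digit: no bucket update needed, leave out[k] = 0
		case d > 0:
			out[k] = uint16(d) << 1 // even value encodes +d
		default:
			out[k] = (uint16(-d-1) << 1) + 1 // odd value encodes d < 0
		}
	}
	out[len(windows)] = uint16(carry) << 1
	return out
}

func main() {
	// 0b1111_0001 split into two 4-bit windows: [1, 15] recodes to digits
	// [+1, -1, +1], i.e. 1 - 16 + 256 = 241. Packed output: [2 1 2].
	fmt.Println(recodeWindows([]int{1, 15}, 4))
}
// ---------------------------------------------------------------------------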
if digit >= max { @@ -631,17 +632,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index c1d32b5ded..3e16fddca6 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E2 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E2 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 
[200]fptower.E2 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E2 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E2 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E2 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E2 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index ae9aca6c47..e3c590196f 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -74,12 +74,12 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC1 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -153,12 +153,12 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC1 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index d65962591c..bf0a181fde 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's 
see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,53 +153,54 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := 
processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -208,8 +209,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +341,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +352,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C 
!= 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -401,53 +402,54 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := 
processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -456,8 +458,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -543,8 +545,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -618,11 
+620,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. if digit >= max { @@ -631,17 +628,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index f060ffc11a..ed8968000e 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E2 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 
[80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E2 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 [200]fptower.E2 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E2 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E2 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E2 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E2 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 0637114932..97a6ac8ac0 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -74,12 +74,12 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -153,12 +153,12 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index edcb161b5e..4e7c44d879 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, 
scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, 
processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -208,8 +208,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +340,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +351,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, 
qG2AffineC12, cG2AffineC12]
 		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4]
 		_innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 13:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13]
 		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9]
 		_innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 14:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
 		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4]
 		_innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 15:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 		processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1]
 		_innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk)
 	case 16:
-		processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16]
+		processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 		_innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk)
 	default:
 		panic("not implemented")
@@ -456,8 +456,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config
 func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
 	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {

-	nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar
-	if (fr.Limbs*64)%c != 0 {
+	nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar
+	if (fr.Bits+1)%c != 0 {
 		nbChunks++
 	}

@@ -543,8 +543,8 @@ type selector struct {
 // 0 < scalar < 2^c (in other words, scalars where only the c least-significant bits are non-zero)
 func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) {
 	// number of c-bit radixes in a scalar
-	nbChunks := fr.Limbs * 64 / c
-	if (fr.Limbs*64)%c != 0 {
+	nbChunks := (fr.Bits + 1) / c
+	if (fr.Bits+1)%c != 0 {
 		nbChunks++
 	}

@@ -618,11 +618,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 			digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh
 		}

-		// if digit is zero, no impact on result
-		if digit == 0 {
-			continue
-		}
-
 		// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 		// 2^{c} from the current digit, making it negative.
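// ---------------------------------------------------------------------------
// Editorial aside (illustrative sketch, not part of this patch): every
// fr.Limbs*64 -> fr.Bits+1 change in these hunks is the same computation,
// ceil((fr.Bits+1)/c). The +1 presumably leaves room for the borrow that the
// signed-digit recoding can propagate out of the top window. A standalone
// sketch; frBits is an assumed value here, not the per-curve generated
// constant.

package main

import "fmt"

const frBits = 253 // assumption: fr.Bits for a BLS12-381-like scalar field

// nbChunks returns the number of c-bit windows covering a recoded scalar.
func nbChunks(c uint64) uint64 {
	n := (frBits + 1) / c
	if (frBits+1)%c != 0 {
		n++ // one extra, partially filled window
	}
	return n
}

func main() {
	for _, c := range []uint64{4, 8, 11, 16} {
		fmt.Printf("c=%2d -> %d chunks\n", c, nbChunks(c))
	}
}
// ---------------------------------------------------------------------------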
if digit >= max { @@ -631,17 +626,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index da6be7a817..f6fb9e2aac 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E2 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E2 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 
[200]fptower.E2 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E2 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E2 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E2 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E2 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index ebaf6a86f7..828296544e 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,53 +153,56 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + processChunk 
:= processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -208,8 +211,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +343,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +354,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -401,53 +404,56 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := 
processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 
15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -456,8 +462,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -543,8 +549,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -618,11 +624,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
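For readers tracking the digit-handling hunk above: the zero-digit skip now happens at encoding time, after the borrow has been propagated, so a borrowed window is never silently dropped. A minimal standalone sketch of the signed-digit packing the generated code implements (encodeDigit and decodeDigit are illustrative helper names, not identifiers from this patch):

// Each c-bit window digit is first recoded into (-2^{c-1}, 2^{c-1}], then
// packed into a uint16: positive digits become even values, negative digits
// become odd values, and 0 means "skip this window entirely".
func encodeDigit(digit int) uint16 {
	if digit == 0 {
		return 0 // no bucket update for this window
	}
	if digit > 0 {
		return uint16(digit) << 1 // even: add the point to bucket digit-1
	}
	return (uint16(-digit-1) << 1) + 1 // odd: subtract the point from bucket -digit-1
}

// decodeDigit inverts the packing; callers skip bits == 0 before decoding.
func decodeDigit(bits uint16) (bucketID uint16, subtractPoint bool) {
	if bits&1 == 0 {
		return (bits >> 1) - 1, false
	}
	return bits >> 1, true
}

Halving the digit range this way is what lets the bucket arrays (and the bitSetCxx types) hold only 2^(c-1) entries instead of 2^c.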
if digit >= max { @@ -631,17 +632,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 800c106b7d..135ccaf2b2 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E4 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E4 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 
[200]fptower.E4 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E4 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E4 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E4 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E4 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 6e3ea0e2f9..9f01ed9a7a 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -74,12 +74,12 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC1 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -153,12 +153,12 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC1 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index c05f920246..c61cd372ec 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's 
see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -174,31 +174,31 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, 
processChunk) default: panic("not implemented") @@ -208,8 +208,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +340,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +351,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -422,31 +422,31 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] + processChunk := 
processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) default: panic("not implemented") @@ -456,8 +456,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -543,8 +543,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -618,11 +618,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
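The recurring fr.Limbs * 64 → fr.Bits + 1 substitution deserves a note: the chunk count is now derived from the number of meaningful scalar bits, plus one bit of headroom so the final borrow of the signed-digit recoding has somewhere to land, rather than from the 64-bit limb storage size. A sketch of the computation (frBits stands in for fr.Bits; this helper is illustrative, not part of the patch):

// nbChunks mirrors the updated chunk-count logic in _innerMsmG1/_innerMsmG2.
// For bn254, where fr.Bits = 254, and c = 16: (254+1)/16 = 15 with remainder
// 15, so the scalar is cut into 16 chunks and the last one is 15 bits wide.
func nbChunks(frBits, c uint64) uint64 {
	n := (frBits + 1) / c
	if (frBits+1)%c != 0 {
		n++
	}
	return n
}

For some window sizes this yields strictly fewer chunks than the limb-based count, for example c = 15 on a 254-bit scalar: (254+1)/15 = 17 chunks exactly, versus 18 with fr.Limbs*64 = 256, which removes an entire bucket-accumulation pass.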
if digit >= max { @@ -631,17 +626,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index f1fb40dea1..252a20acca 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E4 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E4 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 
[200]fptower.E4 type pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E4 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E4 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E4 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E4 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 6989e58d4b..91d94501c6 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,53 +153,54 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 6: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := 
processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qOpsG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] + processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qOpsG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qOpsG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qOpsG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] + processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qOpsG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qOpsG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - 
_innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -208,8 +209,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -340,7 +341,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,16 +352,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -401,53 +402,54 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) case 6: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := 
processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) case 7: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 9: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qOpsG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] + processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qOpsG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qOpsG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qOpsG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] + processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qOpsG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 
15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qOpsG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -456,8 +458,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -543,8 +545,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -618,11 +620,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
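The same bit-count substitution also lands in the window-size heuristic at the top of each MultiExp. In sketch form (mirroring the bestC closure visible in these diffs; implementedCs is the per-curve list of supported window sizes, and this standalone signature is illustrative):

// Approximate MSM cost at window size c: each of the roughly (fr.Bits+1)/c
// chunks scans all nbPoints once and then touches on the order of 2^c buckets.
// Assumes import "math".
func bestC(nbPoints int, frBits uint64, implementedCs []uint64) uint64 {
	min := math.MaxFloat64
	best := implementedCs[0]
	for _, c := range implementedCs {
		cc := (frBits + 1) * uint64(nbPoints+(1<<c))
		cost := float64(cc) / float64(c)
		if cost < min {
			min = cost
			best = c
		}
	}
	return best
}

The heuristic trades bucket count against pass count: a larger c means fewer chunks but exponentially more buckets, so the optimal window grows slowly with nbPoints.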
if digit >= max { @@ -631,17 +628,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 8eeded8aa1..7413da377f 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -300,42 +300,56 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC10 | - qOpsG1AffineC11 | - qOpsG1AffineC12 | - qOpsG1AffineC13 | - qOpsG1AffineC14 | - qOpsG1AffineC15 | - qOpsG1AffineC16 + qG1AffineC10 | + qG1AffineC11 | + qG1AffineC12 | + qG1AffineC13 | + qG1AffineC14 | + qG1AffineC15 | + qG1AffineC16 } + +// batch size 80 when c = 10 type cG1AffineC10 [80]fp.Element type pG1AffineC10 [80]G1Affine type ppG1AffineC10 [80]*G1Affine -type qOpsG1AffineC10 [80]batchOpG1Affine +type qG1AffineC10 [80]batchOpG1Affine + +// batch size 150 when c = 11 type cG1AffineC11 [150]fp.Element type pG1AffineC11 [150]G1Affine type ppG1AffineC11 [150]*G1Affine -type qOpsG1AffineC11 [150]batchOpG1Affine +type qG1AffineC11 [150]batchOpG1Affine + +// batch size 200 when c = 12 type cG1AffineC12 [200]fp.Element type pG1AffineC12 [200]G1Affine type ppG1AffineC12 [200]*G1Affine -type qOpsG1AffineC12 [200]batchOpG1Affine +type qG1AffineC12 [200]batchOpG1Affine + +// batch size 350 when c = 13 type cG1AffineC13 [350]fp.Element type pG1AffineC13 [350]G1Affine type ppG1AffineC13 [350]*G1Affine -type qOpsG1AffineC13 [350]batchOpG1Affine +type qG1AffineC13 [350]batchOpG1Affine + +// batch size 400 when c = 14 type cG1AffineC14 [400]fp.Element type pG1AffineC14 [400]G1Affine type ppG1AffineC14 [400]*G1Affine -type qOpsG1AffineC14 [400]batchOpG1Affine +type qG1AffineC14 [400]batchOpG1Affine + +// batch size 500 when c = 15 type cG1AffineC15 [500]fp.Element type pG1AffineC15 [500]G1Affine type ppG1AffineC15 [500]*G1Affine -type qOpsG1AffineC15 [500]batchOpG1Affine +type qG1AffineC15 [500]batchOpG1Affine + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -616,42 +630,56 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC10 | - qOpsG2AffineC11 | - qOpsG2AffineC12 | - qOpsG2AffineC13 | - qOpsG2AffineC14 | - qOpsG2AffineC15 | - qOpsG2AffineC16 + qG2AffineC10 | + qG2AffineC11 | + qG2AffineC12 | + qG2AffineC13 | + qG2AffineC14 | + qG2AffineC15 | + qG2AffineC16 } + +// batch size 80 when c = 10 type cG2AffineC10 [80]fptower.E2 type pG2AffineC10 [80]G2Affine type ppG2AffineC10 [80]*G2Affine -type qOpsG2AffineC10 [80]batchOpG2Affine +type qG2AffineC10 [80]batchOpG2Affine + +// batch size 150 when c = 11 type cG2AffineC11 [150]fptower.E2 type pG2AffineC11 [150]G2Affine type ppG2AffineC11 [150]*G2Affine -type qOpsG2AffineC11 [150]batchOpG2Affine +type qG2AffineC11 [150]batchOpG2Affine + +// batch size 200 when c = 12 type cG2AffineC12 [200]fptower.E2 type 
pG2AffineC12 [200]G2Affine type ppG2AffineC12 [200]*G2Affine -type qOpsG2AffineC12 [200]batchOpG2Affine +type qG2AffineC12 [200]batchOpG2Affine + +// batch size 350 when c = 13 type cG2AffineC13 [350]fptower.E2 type pG2AffineC13 [350]G2Affine type ppG2AffineC13 [350]*G2Affine -type qOpsG2AffineC13 [350]batchOpG2Affine +type qG2AffineC13 [350]batchOpG2Affine + +// batch size 400 when c = 14 type cG2AffineC14 [400]fptower.E2 type pG2AffineC14 [400]G2Affine type ppG2AffineC14 [400]*G2Affine -type qOpsG2AffineC14 [400]batchOpG2Affine +type qG2AffineC14 [400]batchOpG2Affine + +// batch size 500 when c = 15 type cG2AffineC15 [500]fptower.E2 type pG2AffineC15 [500]G2Affine type ppG2AffineC15 [500]*G2Affine -type qOpsG2AffineC15 [500]batchOpG2Affine +type qG2AffineC15 [500]batchOpG2Affine + +// batch size 640 when c = 16 type cG2AffineC16 [640]fptower.E2 type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 288063d39a..9eaccec8eb 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -74,12 +74,12 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -153,12 +153,12 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index d329dacf85..b71b5a45b3 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks 
> 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -156,13 +156,16 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] + _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -171,8 +174,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -303,7 +306,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -314,16 +317,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := 
nbChunksPostSplit * 2 @@ -367,13 +370,16 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] + _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -382,8 +388,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -469,8 +475,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -544,11 +550,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
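The qOpsG2AffineCxx → qG2AffineCxx renames (and their G1 counterparts) keep the same array lengths; the new // batch size comments just make the pairing with the window size explicit. Collected from those comments (an illustrative lookup, not an identifier from the patch):

// Window size c -> length of the cXxAffineCxx / pXxAffineCxx / ppXxAffineCxx /
// qXxAffineCxx arrays, i.e. how many batchOpG1Affine/batchOpG2Affine operations
// are queued before a batch is flushed.
var batchSizeForC = map[int]int{
	10: 80, 11: 150, 12: 200, 13: 350, 14: 400, 15: 500, 16: 640,
}

The sizes grow with c, presumably because larger windows keep more independent buckets in flight, so a bigger batch can be filled without conflicting updates to the same bucket.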
if digit >= max { @@ -557,17 +558,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 7d3323a044..f25c59a8b6 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -269,12 +269,14 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC16 + qG1AffineC16 } + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -525,12 +527,14 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC16 + qG2AffineC16 } + +// batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index e39d7fc165..d31a0eaf8c 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -65,9 +65,13 @@ type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC4 | + bucketg1JacExtendedC1 | + bucketg1JacExtendedC12 | + bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | bucketg1JacExtendedC16 @@ -122,9 +126,13 @@ type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC4 | + bucketg2JacExtendedC1 | + bucketg2JacExtendedC12 | + bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | bucketg2JacExtendedC16 diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 719bca28bf..20bc87a829 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points 
[]G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,17 +153,20 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -172,8 +175,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -304,7 +307,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -315,16 +318,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) 
// number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -365,17 +368,20 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -384,8 +390,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -471,8 +477,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -546,11 +552,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
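// illustrative example (not part of the patch): with c = 4, max = 2^{c-1} = 8,
// a window value of 11 is rewritten as 11 - 16 = -5 and the borrow bumps the
// next window by one; every digit then fits in [-2^{c-1}, 2^{c-1}], which is
// why the window count is taken over fr.Bits + 1 bits rather than fr.Limbs * 64.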
if digit >= max { @@ -559,17 +560,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 739b3dca2e..419acf811c 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -269,12 +269,14 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC16 + qG1AffineC16 } + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -525,12 +527,14 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC16 + qG2AffineC16 } + +// batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 0cba708584..86ccb23bbc 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -65,9 +65,13 @@ type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC4 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC11 | + bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | bucketg1JacExtendedC16 @@ -122,9 +126,13 @@ type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC4 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC11 | + bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | bucketg2JacExtendedC16 diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 1fce2c8080..e482857188 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,16 +103,16 @@ func (p *G1Jac) MultiExp(points 
[]G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -153,17 +153,20 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config case 4: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] + _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] + _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qOpsG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -172,8 +175,8 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -304,7 +307,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -315,16 +318,16 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a 
scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits + 1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit * 2 @@ -365,17 +368,20 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config case 4: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] + _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) case 5: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) case 8: processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] + _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qOpsG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") } @@ -384,8 +390,8 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -471,8 +477,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs*64)%c != 0 { + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -546,11 +552,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract 
// 2^{c} to the current digit, making it negative. if digit >= max { @@ -559,17 +560,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index d00ef26272..4f039c26bc 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -269,12 +269,14 @@ type ppG1Affine interface { // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qOpsG1AffineC16 + qG1AffineC16 } + +// batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine type ppG1AffineC16 [640]*G1Affine -type qOpsG1AffineC16 [640]batchOpG1Affine +type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 @@ -525,12 +527,14 @@ type ppG2Affine interface { // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qOpsG2AffineC16 + qG2AffineC16 } + +// batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine -type qOpsG2AffineC16 [640]batchOpG2Affine +type qG2AffineC16 [640]batchOpG2Affine type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index af2d68b853..3039c09d6c 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -65,9 +65,15 @@ type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended +type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC4 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | + bucketg1JacExtendedC10 | + bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | bucketg1JacExtendedC16 @@ -122,9 +128,15 @@ type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended +type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC4 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | + bucketg2JacExtendedC10 | + bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | bucketg2JacExtendedC16 diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 6eb2c9f975..f77b9d5ca8 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -28,9 +28,7 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er return x == reflect.ValueOf(a).Len()-1 } lastC := func(c int) int { - // lastC := (fr.Limbs * 64) - (c * 
(fr.Limbs * 64 / c)) - // if c divides fr.Limbs * 64; - n := (conf.Fr.NbWords * 64) + n := (conf.Fr.NbBits + 1) // +1 for the potential carry of the NAF decomposition if n%c == 0 { return c } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index c5b4eb675b..1e3e454dbe 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -42,8 +42,8 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := fr.Limbs * 64 / c - if (fr.Limbs * 64)%c != 0 { + nbChunks := (fr.Bits+1) / c + if (fr.Bits+1)%c != 0 { nbChunks++ } @@ -118,11 +118,6 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks digit += int(scalar[s.index+1] & s.maskHigh) << s.shiftHigh } - // if digit is zero, no impact on result - if digit == 0 { - continue - } - // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -132,17 +127,16 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } var bits uint16 - if digit >= 0 { + + // if digit is zero, no impact on result + if digit == 0 { + continue + } else if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } toReturn[int(chunk)*len(scalars)+i] = bits - // [s.index] |= (bits << s.shift) - // if s.multiWordSelect { - // toReturn[i][s.index+1] |= (bits >> s.shiftHigh) - // } - } } @@ -361,7 +355,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := fr.Limbs * 64 * (nbPoints + (1 << (c))) + cc := (fr.Bits+1) * (nbPoints + (1 << (c))) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -372,16 +366,16 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } C := bestC(nbPoints) - nbChunks := int(fr.Limbs * 64 / C) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%C != 0 { + nbChunks := int((fr.Bits+1) / C) // number of c-bit radixes in a scalar + if (fr.Bits+1)%C != 0 { nbChunks++ } // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints/2) - nbChunksPostSplit := int(fr.Limbs * 64 / cSplit) - if (fr.Limbs*64)%cSplit != 0 { + nbChunksPostSplit := int((fr.Bits+1) / cSplit) + if (fr.Bits+1)%cSplit != 0 { nbChunksPostSplit++ } nbTasksPostSplit := nbChunksPostSplit*2 @@ -429,7 +423,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $c 9}} processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, qOps{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] + processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, 
q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} {{- if eq $c $lc}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk) @@ -437,7 +431,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} - processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, qOps{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} @@ -450,8 +444,8 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16)) *{{ $.TJacobian }} { - nbChunks := (fr.Limbs * 64 / c) // number of c-bit radixes in a scalar - if (fr.Limbs*64)%c != 0 { + nbChunks := ((fr.Bits+1) / c) // number of c-bit radixes in a scalar + if (fr.Bits+1)%c != 0 { nbChunks++ } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index ccfba7f2be..312ea0897f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -303,19 +303,19 @@ type pp{{ $.TAffine }} interface { type qOps{{ $.TAffine }} interface { {{- range $i, $c := $.CRange}} {{- if gt $c 9}} - qOps{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} + q{{ $.TAffine }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} {{- end}} {{- end}} } {{- range $c := $.CRange}} -{{- if gt $c 9}} +{{if gt $c 9}} +// batch size {{batchSize $c}} when c = {{$c}} type c{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]{{ $.CoordType }} type p{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]{{ $.TAffine }} type pp{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]*{{ $.TAffine }} -type qOps{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]batchOp{{ $.TAffine }} - +type q{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]batchOp{{ $.TAffine }} {{- end}} {{- end}} From 533743e184b86c2f62ede7e743149107f5e28d71 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 09:30:57 -0600 Subject: [PATCH 22/43] style: cosmetics --- ecc/bls12-377/multiexp.go | 60 ++++++++----------- ecc/bls12-378/multiexp.go | 56 ++++++++--------- ecc/bls12-381/multiexp.go | 52 +++++++--------- ecc/bls24-315/multiexp.go | 60 ++++++++----------- ecc/bls24-317/multiexp.go | 52 +++++++--------- ecc/bn254/multiexp.go | 56 ++++++++--------- ecc/bw6-633/multiexp.go | 56 ++++++++--------- ecc/bw6-756/multiexp.go | 56 ++++++++--------- ecc/bw6-761/multiexp.go | 56 ++++++++--------- .../generator/ecc/template/multiexp.go.tmpl | 36 +++++------ 10 files changed, 234 insertions(+), 306 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 4ec5027cfb..f6ea24d92c 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) 
MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -197,11 +192,11 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -211,10 +206,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -343,7 +335,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -354,18 +346,13 @@ func (p *G2Jac) 
MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -448,11 +435,11 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -462,10 +449,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -539,6 +523,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
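The constant 253 below is the bit length of the BLS12-377 scalar field modulus; with it, computeNbChunks(16) returns 16 windows and leaves only 14 bits for the top one, which is why the c = 16 cases above pair a C16 processChunk with a C14 processLastChunk. A minimal standalone sketch of that arithmetic (illustrative only, not part of the patch):

package main

import "fmt"

const frBits uint64 = 253 // bit length of the BLS12-377 scalar field modulus

// mirrors computeNbChunks: ceil((fr.Bits+1) / c), the +1 absorbing the
// final carry of the signed-digit decomposition in partitionScalars.
func computeNbChunks(c uint64) uint64 {
	nbChunks := (frBits + 1) / c
	if (frBits+1)%c != 0 {
		nbChunks++
	}
	return nbChunks
}

func main() {
	c := uint64(16)
	nbChunks := computeNbChunks(c)         // 16 windows
	lastC := (frBits + 1) - c*(nbChunks-1) // bits left for the top window
	fmt.Println(nbChunks, lastC)           // 16 14 -> bucketG1AffineC14 above
}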
@@ -549,10 +544,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index bf0a181fde..9f16ef2b91 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -199,7 +194,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -209,10 +204,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -341,7 +333,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + 
cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -352,18 +344,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -448,7 +435,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -458,10 +445,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -535,6 +519,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
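The doc comment above states the borrow rule abstractly; a toy recoding of an 8-bit scalar (illustrative helper, not the library's code) shows the rule in action and why the final carry can spill into one extra window:

package main

import "fmt"

// recode splits scalar into nbChunks c-bit windows and applies the borrow
// rule: any digit >= 2^{c-1} becomes digit - 2^c, with a carry of 1 pushed
// into the next window.
func recode(scalar uint64, c, nbChunks uint64) []int64 {
	max := int64(1) << (c - 1)
	mask := uint64(1)<<c - 1
	digits := make([]int64, nbChunks)
	carry := int64(0)
	for i := uint64(0); i < nbChunks; i++ {
		d := carry + int64((scalar>>(i*c))&mask)
		carry = 0
		if d >= max {
			d -= int64(1) << c // borrow 2^c from the next window
			carry = 1
		}
		digits[i] = d
	}
	return digits
}

func main() {
	// 219 = 0xDB: raw 4-bit windows are [11, 13]; both exceed 2^{c-1} = 8,
	// so each is recoded and the borrow ripples into a third window.
	fmt.Println(recode(219, 4, 3)) // [-5 -2 1], and -5 - 2*16 + 1*256 = 219
}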
@@ -545,10 +540,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 4e7c44d879..a9d35fd9aa 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -208,10 +203,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -340,7 +332,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,18 +343,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) 
- if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -456,10 +443,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -533,6 +517,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -543,10 +538,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 828296544e..970cd52000 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -197,11 +192,11 @@ func innerMsmG1(p 
*G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -211,10 +206,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -343,7 +335,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -354,18 +346,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -448,11 +435,11 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) case 15: processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] 
+ processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -462,10 +449,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -539,6 +523,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
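partitionScalars then packs each signed digit into a uint16. The helper names below are hypothetical, but the bit layout mirrors the patch: the low bit carries the sign, the remaining bits the magnitude, and 0 is reserved for "digit is zero, skip this point for this window":

package main

import "fmt"

func pack(digit int) uint16 {
	if digit == 0 {
		return 0 // no bucket touched
	}
	if digit > 0 {
		return uint16(digit) << 1 // even: add to bucket
	}
	return (uint16(-digit-1) << 1) + 1 // odd: subtract from bucket
}

func unpack(bits uint16) int {
	if bits == 0 {
		return 0
	}
	if bits&1 == 0 {
		return int(bits >> 1)
	}
	return -int(bits>>1) - 1
}

func main() {
	for _, d := range []int{-8, -1, 0, 1, 7} {
		fmt.Println(d, pack(d), unpack(pack(d))) // round-trips for every digit
	}
}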
@@ -549,10 +544,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index c61cd372ec..52f2fe363f 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -208,10 +203,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -340,7 +332,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -351,18 +343,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) 
- if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -456,10 +443,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -533,6 +517,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -543,10 +538,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 91d94501c6..dd3d4c2183 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -199,7 +194,7 @@ func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -209,10 +204,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -341,7 +333,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -352,18 +344,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -448,7 +435,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -458,10 +445,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- 
g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -535,6 +519,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -545,10 +540,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index b71b5a45b3..b16ddb3d18 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -164,7 +159,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] _innerMsmG1(p, 16, points, digits, 
splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -174,10 +169,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -306,7 +298,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -317,18 +309,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -378,7 +365,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -388,10 +375,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -465,6 +449,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a 
potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -475,10 +470,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 20bc87a829..be86f40a8f 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -165,7 +160,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -175,10 +170,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for 
each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -307,7 +299,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -318,18 +310,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -380,7 +367,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -390,10 +377,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -467,6 +451,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
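The recoding rule documented above can be checked in isolation. Below is a minimal, self-contained sketch of the same borrow-and-carry idea on a toy uint64 scalar; signedDigits and the c = 4 window are illustrative stand-ins, not code from this patch, and the real partitionScalars packs signs into uint16 digits rather than returning int64s.

package main

import "fmt"

// signedDigits splits a scalar into c-bit windows; a digit larger than
// 2^(c-1) borrows 2^c from the next window and becomes negative. The
// final carry can spill into one extra window, which is why the chunk
// count is computed over fr.Bits+1 bits.
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d > int64(1)<<(c-1) {
			d -= int64(1) << c // borrow 2^c from the next window...
			carry = 1          // ...and propagate the carry up
		}
		digits = append(digits, d)
	}
	return digits
}

func main() {
	const c = 4
	s := uint64(0xDEADBEEF)
	ds := signedDigits(s, c)
	// recompose sum(ds[i] * 2^(c*i)) to check the decomposition is exact
	acc := int64(0)
	for i := len(ds) - 1; i >= 0; i-- {
		acc = acc<<c + ds[i]
	}
	fmt.Println(ds, uint64(acc) == s) // prints the digits, then true
}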
@@ -477,10 +472,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index e482857188..7272a5efef 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -92,7 +92,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := (fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -103,18 +103,13 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -165,7 +160,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + processLastChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -175,10 +170,7 @@ func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -307,7 +299,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits + 1) * (nbPoints + (1 << (c))) + cc := 
(fr.Bits + 1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -318,18 +310,13 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } C := bestC(nbPoints) - nbChunks := int((fr.Bits + 1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints / 2) - nbChunksPostSplit := int((fr.Bits + 1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit * 2 if (nbTasksPostSplit <= config.NbTasks/2) || (nbTasksPostSplit-config.NbTasks/2) <= (config.NbTasks-nbChunks) { // if postSplit we still have less tasks than available CPU @@ -380,7 +367,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) case 16: processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + processLastChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) default: panic("not implemented") @@ -390,10 +377,7 @@ func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { - nbChunks := ((fr.Bits + 1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window @@ -467,6 +451,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits + 1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
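As a sanity reference, computeNbChunks above is a ceiling division in disguise: nbChunks = ceil((fr.Bits+1)/c), where the +1 absorbs the carry that the signed-digit decomposition may push past the top window. A standalone check of that equivalence (frBits here is an assumed stand-in constant, not taken from this codebase):

package main

import "fmt"

const frBits = 377 // stand-in scalar-field bit size, for illustration only

func computeNbChunks(c uint64) uint64 {
	nbChunks := (frBits + 1) / c
	if (frBits+1)%c != 0 {
		nbChunks++
	}
	return nbChunks
}

func main() {
	for c := uint64(4); c <= 16; c++ {
		nb := computeNbChunks(c)
		// closed form of the same ceiling division, and the covering
		// invariant the chunking relies on: (nb-1)*c < frBits+1 <= nb*c
		fmt.Println(c, nb, nb == (frBits+c)/c, (nb-1)*c < frBits+1 && frBits+1 <= nb*c)
	}
}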
@@ -477,10 +472,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 1e3e454dbe..7066a005af 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -32,6 +32,17 @@ type selector struct { shiftHigh uint64 // same than shift, for index+1 } +// return number of chunks for a given window size c +func computeNbChunks(c uint64) uint64 { + // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF + // decomposition in partitionScalars + nbChunks := (fr.Bits+1) / c + if (fr.Bits+1)%c != 0 { + nbChunks++ + } + return nbChunks +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -42,10 +53,7 @@ type selector struct { // 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { // number of c-bit radixes in a scalar - nbChunks := (fr.Bits+1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) toReturn := make([]uint16, len(scalars)*int(nbChunks)) @@ -355,7 +363,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem // for example, on a MBP 2016, for G2 MultiExp > 8M points, hand picking c gives better results min := math.MaxFloat64 for _, c := range implementedCs { - cc := (fr.Bits+1) * (nbPoints + (1 << (c))) + cc := (fr.Bits+1) * (nbPoints + (1 << c)) cost := float64(cc) / float64(c) if cost < min { min = cost @@ -366,18 +374,13 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } C := bestC(nbPoints) - nbChunks := int((fr.Bits+1) / C) // number of c-bit radixes in a scalar - if (fr.Bits+1)%C != 0 { - nbChunks++ - } + nbChunks := int(computeNbChunks(C)) + // if we don't utilise all the tasks (CPU in the default case) that we could, let's see if it's worth it to split if config.NbTasks > 1 && nbChunks < config.NbTasks { // before spliting, let's see if we endup with more tasks than thread; cSplit := bestC(nbPoints/2) - nbChunksPostSplit := int((fr.Bits+1) / cSplit) - if (fr.Bits+1)%cSplit != 0 { - nbChunksPostSplit++ - } + nbChunksPostSplit := int(computeNbChunks(cSplit)) nbTasksPostSplit := nbChunksPostSplit*2 if (nbTasksPostSplit <= config.NbTasks /2 ) || ( nbTasksPostSplit - config.NbTasks/2 ) <= ( config.NbTasks - nbChunks) { // if postSplit we still have less tasks than available CPU @@ -431,7 +434,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi {{- if le $lc 9}} processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}] {{- else}} - processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, 
c{{$.TAffine}}C{{$c}}] + processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{lastC $c}}, p{{$.TAffine}}C{{lastC $c}}, pp{{$.TAffine}}C{{lastC $c}}, q{{$.TAffine}}C{{lastC $c}}, c{{$.TAffine}}C{{lastC $c}}] {{- end}} _innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processLastChunk) {{- end}} @@ -444,10 +447,7 @@ func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffi func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool, processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16)) *{{ $.TJacobian }} { - nbChunks := ((fr.Bits+1) / c) // number of c-bit radixes in a scalar - if (fr.Bits+1)%c != 0 { - nbChunks++ - } + nbChunks := computeNbChunks(c) // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window From 59eb243242b8b7fb4cb2397e9466ac2d9c8df08b Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 10:37:08 -0600 Subject: [PATCH 23/43] style: code cleaning --- ecc/bls12-377/multiexp.go | 209 +++++++++--------- ecc/bls12-377/multiexp_affine.go | 154 +++---------- ecc/bls12-377/multiexp_test.go | 31 +-- ecc/bls12-378/multiexp.go | 205 +++++++++-------- ecc/bls12-378/multiexp_affine.go | 154 +++---------- ecc/bls12-378/multiexp_test.go | 31 +-- ecc/bls12-381/multiexp.go | 203 +++++++++-------- ecc/bls12-381/multiexp_affine.go | 154 +++---------- ecc/bls12-381/multiexp_test.go | 31 +-- ecc/bls24-315/multiexp.go | 209 +++++++++--------- ecc/bls24-315/multiexp_affine.go | 154 +++---------- ecc/bls24-315/multiexp_test.go | 31 +-- ecc/bls24-317/multiexp.go | 203 +++++++++-------- ecc/bls24-317/multiexp_affine.go | 154 +++---------- ecc/bls24-317/multiexp_test.go | 31 +-- ecc/bn254/multiexp.go | 205 +++++++++-------- ecc/bn254/multiexp_affine.go | 154 +++---------- ecc/bn254/multiexp_test.go | 31 +-- ecc/bw6-633/multiexp.go | 99 +++++---- ecc/bw6-633/multiexp_affine.go | 154 +++---------- ecc/bw6-633/multiexp_test.go | 31 +-- ecc/bw6-756/multiexp.go | 101 +++++---- ecc/bw6-756/multiexp_affine.go | 154 +++---------- ecc/bw6-756/multiexp_test.go | 31 +-- ecc/bw6-761/multiexp.go | 101 +++++---- ecc/bw6-761/multiexp_affine.go | 154 +++---------- ecc/bw6-761/multiexp_test.go | 31 +-- .../generator/ecc/template/multiexp.go.tmpl | 67 +++--- .../ecc/template/multiexp_affine.go.tmpl | 78 ++----- .../ecc/template/tests/multiexp.go.tmpl | 16 +- 30 files changed, 1254 insertions(+), 2107 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index f6ea24d92c..cf970c5246 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -128,83 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // 
if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return 
processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen: values of c other than the above are not generated by the templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -221,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller.
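	// (the "different method" is selected via getChunkProcessorG1(lastC(c)) just
	// below: the top window of the decomposition spans at most lastC(c) <= c bits,
	// so its buckets can be sized for lastC(c) rather than c)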
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -231,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -371,83 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - 
processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - _innerMsmG2(p, 16, points, digits, 
splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen: values of c other than the above are not generated by the templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { nbChunks := computeNbChunks(c) @@ -464,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessorG2(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG2(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -474,6 +471,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG2(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -534,6 +532,17 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// return the last window size for a scalar; if c divides the scalar size +// then it returns c +// if not, returns lastC with lastC < c +func lastC(c uint64) uint64 { + const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition + if n%c == 0 { + return c + } + return n - (c * (n / c)) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 3e16fddca6..8f33a80438 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict with the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict with the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 4b4406e922..347fb0ab6b 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 9f16ef2b91..91f7ede410 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -128,81 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return 
processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -219,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
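A note on why the dispatch above is a switch at all: Go generics have no constant (integer) parameters, so the bucket array length 1<<(c-1) cannot be a runtime value; each window width gets its own named array type and its own instantiation of the chunk processor, and getChunkProcessorG1 maps the runtime c onto the matching instantiation. A minimal self-contained sketch of that pattern follows; the toy types and names are hypothetical, only the pattern mirrors the code above.

package main

import "fmt"

type bucketC4 [1 << 3]int // 2^(c-1) buckets for c = 4
type bucketC5 [1 << 4]int // 2^(c-1) buckets for c = 5

type ibBucket interface {
	bucketC4 | bucketC5
}

// processChunk is generic over the bucket array; the array size is fixed
// at instantiation time, which is exactly why callers need the switch.
func processChunk[B ibBucket](digits []uint16) int {
	var buckets B // lives on the stack, size known at compile time
	for _, d := range digits {
		buckets[int(d)%len(buckets)]++
	}
	total := 0
	for i := 0; i < len(buckets); i++ {
		total += buckets[i]
	}
	return total
}

func getChunkProcessor(c uint64) func([]uint16) int {
	switch c {
	case 4:
		return processChunk[bucketC4]
	case 5:
		return processChunk[bucketC5]
	default:
		return processChunk[bucketC5] // mirror the template's fallback case
	}
}

func main() {
	fmt.Println(getChunkProcessor(4)([]uint16{1, 2, 2, 7}))
}
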
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -229,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -369,81 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 
9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, 
cG2AffineC16]
	default:
-		panic("not implemented")
+		// panic("will not happen: a c outside the values above is not generated by the templates")
+		return processChunkG2Jacobian[bucketg2JacExtendedC16]
	}
}

-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {

	nbChunks := computeNbChunks(c)

@@ -460,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])

	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
	}

@@ -470,6 +471,7 @@
	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed
	// in the ~same amount of time
	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
		if !splitFirstChunk {
			go processChunk(0, chChunks[0], c, points, digits[:n])
		} else {
@@ -530,6 +532,17 @@ func computeNbChunks(c uint64) uint64 {
	return nbChunks
}

+// lastC returns the size of the last window to process; if c divides the
+// scalar size (fr.Bits + 1), it returns c; otherwise it returns the
+// remainder (fr.Bits + 1) mod c, which is always smaller than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
 // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
 // 2^{c} from the current digit, making it negative.
diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go
index ed8968000e..09ee239d09 100644
--- a/ecc/bls12-378/multiexp_affine.go
+++ b/ecc/bls12-378/multiexp_affine.go
@@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
	}

	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
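To make lastC concrete: with n = fr.Bits + 1 bits to decompose, every window has width c except possibly the last one, which gets the n mod c leftover bits. A standalone sketch with a hypothetical 253-bit scalar field; computeNbChunks is reconstructed from context here and is an assumption, not a copy of the library code.

package main

import "fmt"

const n uint64 = 253 + 1 // toy fr.Bits + 1 (carry bit of the signed decomposition)

// assumed reconstruction: number of c-bit windows needed to cover n bits
func computeNbChunks(c uint64) uint64 {
	nbChunks := n / c
	if n%c != 0 {
		nbChunks++
	}
	return nbChunks
}

func lastC(c uint64) uint64 {
	if n%c == 0 {
		return c
	}
	return n - c*(n/c) // n mod c, always < c
}

func main() {
	for _, c := range []uint64{4, 5, 8, 16} {
		fmt.Printf("c=%2d nbChunks=%2d lastC=%2d\n", c, computeNbChunks(c), lastC(c))
	}
}
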
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
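The consolidation above (addFromQueue folded into add, processTopQueue dropped) leaves the core invariant intact: batch inversion shares a single field inverse across the whole batch, so each bucket may be touched at most once per batch, and an operation that hits a busy bucket is parked in the queue until after a flush. A toy model of that scheduling, with plain integers instead of curve points and a simplified replay policy (the real code drains the queue via processQueue at different points in the digit loop):

package main

import "fmt"

const batchSize = 4

type op struct{ bucketID uint16 }

func main() {
	var used [8]bool // stand-in for the bitSet bucketIds
	batch := make([]op, 0, batchSize)
	var queue []op

	flush := func() {
		// the real code performs one batched affine addition here,
		// amortizing a single field inversion over the whole batch
		fmt.Println("flush", batch)
		batch = batch[:0]
		used = [8]bool{}
	}

	var schedule func(op)
	schedule = func(o op) {
		if used[o.bucketID] {
			queue = append(queue, o) // conflict: wait for a later batch
			return
		}
		used[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchSize {
			flush()
			replay := queue
			queue = nil
			for _, q := range replay {
				schedule(q)
			}
		}
	}

	for _, b := range []uint16{1, 2, 1, 3, 4, 1, 5} {
		schedule(op{bucketID: b})
	}
	if len(batch) > 0 {
		flush()
	}
	fmt.Println("still queued:", queue)
}
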
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index b710acf39b..90b1321a9a 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index a9d35fd9aa..191875391e 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -128,80 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, 
ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -218,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
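The goroutine wiring in _innerMsmG1/_innerMsmG2 is a plain scatter/gather: one goroutine and one result channel per chunk, then a fold from the most significant chunk down, doubling c times between chunks. A toy sketch with integers standing in for the g1JacExtended chunk accumulators:

package main

import "fmt"

func main() {
	const c = 4
	digitsPerChunk := [][]uint16{{3, 1}, {0, 2}, {7, 0}} // chunk 0 = least significant

	nbChunks := len(digitsPerChunk)
	chChunks := make([]chan int, nbChunks)
	for j := range chChunks {
		chChunks[j] = make(chan int, 1)
	}

	// scatter: one goroutine per chunk, each summing its own digits
	for j := nbChunks - 1; j >= 0; j-- {
		go func(j int) {
			sum := 0
			for _, d := range digitsPerChunk[j] {
				sum += int(d)
			}
			chChunks[j] <- sum
		}(j)
	}

	// gather: fold from the most significant chunk down, multiplying by 2^c
	// between chunks (the real code doubles the running point c times)
	total := 0
	for j := nbChunks - 1; j >= 0; j-- {
		total = total<<c + <-chChunks[j]
	}
	fmt.Println("msm-like reduction:", total)
}
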
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -228,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -368,80 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + 
return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + 
return processChunkG2Jacobian[bucketg2JacExtendedC16]
	}
}

-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {

	nbChunks := computeNbChunks(c)

@@ -458,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
	// the last chunk may be processed with a different method than the rest, as it could be smaller.
	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])

	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
	}

@@ -468,6 +471,7 @@
	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed
	// in the ~same amount of time
	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
		if !splitFirstChunk {
			go processChunk(0, chChunks[0], c, points, digits[:n])
		} else {
@@ -528,6 +532,17 @@ func computeNbChunks(c uint64) uint64 {
	return nbChunks
}

+// lastC returns the size of the last window to process; if c divides the
+// scalar size (fr.Bits + 1), it returns c; otherwise it returns the
+// remainder (fr.Bits + 1) mod c, which is always smaller than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits
 // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and subtract
 // 2^{c} from the current digit, making it negative.
diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go
index f6fb9e2aac..f452ac210e 100644
--- a/ecc/bls12-381/multiexp_affine.go
+++ b/ecc/bls12-381/multiexp_affine.go
@@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
	}

	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
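The partitionScalars comment above describes a signed-digit recoding. A runnable toy version of that rule (every window strictly larger than 2^{c-1} borrows 2^c from the next window, leaving each digit in [-2^{c-1}, 2^{c-1}]), independent of the library's packed uint16 digit representation:

package main

import "fmt"

func recode(scalar uint64, c uint) []int {
	max := int64(1) << (c - 1) // 2^{c-1}
	mask := uint64(1)<<c - 1
	var digits []int
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d > max {
			d -= int64(1) << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, int(d))
	}
	return digits
}

func main() {
	// 246 = 0b1111_0110; with c = 4 the naive windows are [6, 15], but 15
	// recodes to -1 with a carry, giving digits [6, -1, 1]: 6 - 16 + 256.
	fmt.Println(recode(246, 4))
}
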
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
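A note on why add resolves the equal-X cases up front instead of batching them: the batched addition shares one inverted denominator per point, and that denominator is the difference of the X coordinates, which vanishes for both P + P and P + (-P). A toy classification of the cases, with a hypothetical stand-in point type:

package main

import "fmt"

type pt struct {
	x, y int
	inf  bool
}

// classify mirrors the branch order of the add closure above
func classify(bucket, p pt) string {
	switch {
	case bucket.inf:
		return "empty bucket: just set bucket = P"
	case bucket.x == p.x && bucket.y == p.y:
		return "P + P: double in place (rare path)"
	case bucket.x == p.x:
		return "P + (-P): set bucket to infinity"
	default:
		return "generic add: safe to defer to the batch"
	}
}

func main() {
	bucket := pt{x: 1, y: 2}
	for _, p := range []pt{{x: 1, y: 2}, {x: 1, y: -2}, {x: 3, y: 4}} {
		fmt.Println(classify(bucket, p))
	}
	fmt.Println(classify(pt{inf: true}, pt{x: 1, y: 2}))
}
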
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 1f8539c0bf..5b0b8eb7cc 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 970cd52000..894b89a73d 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -128,83 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + 
return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -221,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
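// A minimal sketch (not part of the generated code) of the dispatch introduced
// above: getChunkProcessorG1 maps a window width to a single processing
// function, which is requested both for the middle chunks (width c) and for the
// narrower last chunk (width lastC(c)). Hypothetical names stand in for the
// generated generic instantiations:
func selectProcessorSketch(c uint64, mustBeExt bool) string {
	switch {
	case c >= 4 && c <= 9:
		return "jacobian-extended" // the only instantiation generated for small windows
	case c >= 10 && c <= 16 && !mustBeExt:
		return "batch-affine" // default for large windows
	case c >= 10 && c <= 16:
		return "jacobian-extended" // mustBeExt forces the fallback
	default:
		return "jacobian-extended, C16 buckets" // oversized fallback for widths without a dedicated instantiation
	}
}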
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -231,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -371,83 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - 
processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - _innerMsmG2(p, 16, points, digits, 
splitFirstChunk, processChunk, processLastChunk)
+		if mustBeExt {
+			return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		}
+		return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
-		panic("not implemented")
+		// panic("will not happen: c values other than the ones above are not generated by the templates")
+		return processChunkG2Jacobian[bucketg2JacExtendedC16]
 	}
 }
 
-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {
 
 	nbChunks := computeNbChunks(c)
 
@@ -464,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
 	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
 		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
 	}
 
@@ -474,6 +471,7 @@
 	// --> if that is the case, we launch 2 goroutines to process the chunk to ensure all chunks are processed
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
 		if !splitFirstChunk {
 			go processChunk(0, chChunks[0], c, points, digits[:n])
 		} else {
@@ -534,6 +532,17 @@ func computeNbChunks(c uint64) uint64 {
 	return nbChunks
 }
 
+// lastC returns the size of the last window (the most significant one);
+// if c divides the scalar size, it returns c;
+// otherwise it returns the remainder, which is strictly smaller than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
 // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go
index 135ccaf2b2..fa683c78a1 100644
--- a/ecc/bls24-315/multiexp_affine.go
+++ b/ecc/bls24-315/multiexp_affine.go
@@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
 	}
 
 	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
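// A worked example of the lastC helper added above, self-contained and assuming
// a hypothetical 253-bit scalar field, so n = fr.Bits+1 = 254 (the real value is
// curve-dependent):
func lastCSketch(c uint64) uint64 {
	const n = uint64(253 + 1) // stand-in for fr.Bits + 1; the +1 absorbs the decomposition carry
	if n%c == 0 {
		return c
	}
	return n - c*(n/c) // equivalently n % c
}

// For instance lastCSketch(16) == 14 (254 = 15*16 + 14): the most significant
// window is only 14 bits wide, so the last chunk is handed to the C14 processor;
// lastCSketch(127) == 127, since 127 divides 254 exactly.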
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
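// The scheduling invariant behind the rewrite of processChunkG1BatchAffine
// above, as a self-contained sketch: a bucket may enter the current batch at
// most once, because one shared inversion serves every lambda denominator in
// the batch, and a second addition into the same bucket would read a stale
// value. Conflicting operations are parked in a queue and retried after a
// flush. Here uint16 bucket ids stand in for (bucket, point) pairs, a map
// stands in for the bitSet BS, and the final drain of queue and batch is
// elided:
func conflictBatchSketch(ops []uint16, batchSize int) (batches [][]uint16) {
	inBatch := make(map[uint16]bool)
	var batch, queue []uint16
	flush := func() {
		batches = append(batches, batch) // batchAddG1Affine runs here in the real code
		batch = nil
		inBatch = make(map[uint16]bool)
	}
	for _, b := range ops {
		if inBatch[b] {
			queue = append(queue, b) // conflicts with the current batch
			continue
		}
		inBatch[b] = true
		batch = append(batch, b)
		if len(batch) == batchSize {
			flush()
			for i := len(queue) - 1; i >= 0; i-- { // retry queued ops, newest first
				if inBatch[queue[i]] {
					continue
				}
				inBatch[queue[i]] = true
				batch = append(batch, queue[i])
				queue = append(queue[:i], queue[i+1:]...)
				if len(batch) == batchSize {
					flush()
				}
			}
		}
	}
	return batches
}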
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 9307ba079d..9bda4cefd3 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 52f2fe363f..7cd831c238 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -128,80 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, 
points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, 
ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -218,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
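// A note on the default arm of getChunkProcessorG1 above: the selector is also
// invoked with lastC(c), which can be narrower than any width the switch
// handles explicitly (the old code bound e.g. bucketg1JacExtendedC1 or C3 for
// those last chunks). Falling back to the C16 extended-Jacobian processor
// appears safe because, in the signed-digit scheme used here (digits in
// [-2^{c-1}, 2^{c-1}]), a window of width w touches at most 2^(w-1) buckets,
// so a C16-sized bucket array over-provisions any narrower window:
func bucketsNeededSketch(w uint64) uint64 {
	return 1 << (w - 1) // e.g. w = 3 needs only 4 buckets, well within the 1<<15 of a C16 array
}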
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -228,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -368,80 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + 
return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + 
return processChunkG2Jacobian[bucketg2JacExtendedC16]
 	}
 }
 
-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {
 
 	nbChunks := computeNbChunks(c)
 
@@ -458,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
 	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
 		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
 	}
 
@@ -468,6 +471,7 @@
 	// --> if that is the case, we launch 2 goroutines to process the chunk to ensure all chunks are processed
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
 		if !splitFirstChunk {
 			go processChunk(0, chChunks[0], c, points, digits[:n])
 		} else {
@@ -528,6 +532,17 @@ func computeNbChunks(c uint64) uint64 {
 	return nbChunks
 }
 
+// lastC returns the size of the last window (the most significant one);
+// if c divides the scalar size, it returns c;
+// otherwise it returns the remainder, which is strictly smaller than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars computes, for each scalar over c-bit wide windows, nbChunks digits
 // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go
index 252a20acca..913b2e9308 100644
--- a/ecc/bls24-317/multiexp_affine.go
+++ b/ecc/bls24-317/multiexp_affine.go
@@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
 	}
 
 	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
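// The concurrency shape shared by _innerMsmG1 and _innerMsmG2 above, as a
// runnable sketch: one buffered channel per window, the last (possibly
// narrower) window dispatched with its own processor from
// getChunkProcessor*(lastC(c)). Hypothetical int results stand in for the
// extended-Jacobian chunk sums, and the final reduction is elided:
func fanOutSketch(nbChunks int, process func(chunkID int, out chan<- int)) []chan int {
	chChunks := make([]chan int, nbChunks)
	for i := range chChunks {
		chChunks[i] = make(chan int, 1) // buffered so a worker never blocks on send
	}
	go process(nbChunks-1, chChunks[nbChunks-1]) // last chunk first
	for j := nbChunks - 2; j > 0; j-- {          // middle chunks
		go process(j, chChunks[j])
	}
	if nbChunks > 1 {
		go process(0, chChunks[0]) // chunk 0; the real code may split it across two goroutines
	}
	return chChunks
}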
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
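// Why the add() closure above peels off special cases before batching: the
// batched affine addition computes lambda = (y2 - y1)/(x2 - x1) with one
// inversion shared across the whole batch, and that denominator is unusable
// when the bucket is still at infinity or when the two x-coordinates coincide
// (a doubling or a P + (-P) cancellation). A sketch of the guard, with ints
// standing in for field elements:
type affinePointSketch struct {
	x, y  int
	isInf bool
}

func canBatchAddSketch(bucket, p affinePointSketch) bool {
	if bucket.isInf {
		return false // resolved eagerly: the bucket simply becomes p
	}
	if bucket.x == p.x {
		return false // same x: doubling (p.y == bucket.y) or cancellation to infinity
	}
	return true
}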
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 5945e42e8a..c166598a34 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index dd3d4c2183..9b3140de09 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -128,81 +128,79 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points 
[]G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC6] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC7] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC9] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - processChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - _innerMsmG1(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - processChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return 
processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - processChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - processChunk := processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - _innerMsmG1(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } + return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - processChunk := processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } + return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - processChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - _innerMsmG1(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } + return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -219,9 +217,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
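// The splitFirstChunk flag threaded into _innerMsmG1 above comes from the 10%
// heuristic computed in MultiExp: scalars that fit entirely in the lowest
// c-bit window contribute work only to chunk 0, so when at least a tenth of
// the scalars are "small", chunk 0 is split across two goroutines, each
// processing half the points, and the two partial sums are combined
// afterwards. The decision itself reduces to:
func shouldSplitFirstChunkSketch(smallValues, nbScalars int) bool {
	return float64(smallValues)/float64(nbScalars) >= 0.1
}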
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -229,6 +229,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -369,81 +370,79 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC6] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 6, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 7, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC7] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC9] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 
9, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - processChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - _innerMsmG2(p, 10, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - processChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 11, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - processChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 12, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - processChunk := processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - _innerMsmG2(p, 13, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } + return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - processChunk := processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 14, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } + return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - processChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - _innerMsmG2(p, 15, points, digits, splitFirstChunk, processChunk, processChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } + return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, 
cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { nbChunks := computeNbChunks(c) @@ -460,9 +459,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessorG2(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG2(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -470,6 +471,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG2(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -530,6 +532,17 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// lastC returns the size, in bits, of the last window of the decomposition; +// we decompose n = fr.Bits+1 bits (one extra bit for the potential NAF carry), +// so it returns c when c divides n, and the remainder n mod c (always < c) otherwise +func lastC(c uint64) uint64 { + const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition + if n%c == 0 { + return c + } + return n - (c * (n / c)) +} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative. diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 7413da377f..8f6e9073b8 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -50,64 +50,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -149,15 +115,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -166,19 +129,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -192,7 +142,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -214,7 +164,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -380,64 +329,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -479,15 +394,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -496,19 +408,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -522,7 +421,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -544,7 +443,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 23dc3b5897..5962e0b859 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 
runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index b16ddb3d18..71170c6cf9 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -128,46 +128,43 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points []G1Affine, scalars 
[]fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC1] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -184,9 +181,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
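// NOTE (editorial sketch, not part of the generated code): the dispatch below is a
// plain fan-out: one goroutine and one buffered channel per chunk, with the caller
// folding the per-chunk sums afterwards. A generic, self-contained version of that
// shape (fanOut and its parameters are illustrative, not the library's API):

func fanOut[T any](nbChunks int, process func(chunk int, out chan<- T)) []chan T {
	chs := make([]chan T, nbChunks)
	for i := range chs {
		chs[i] = make(chan T, 1) // buffered: the worker never blocks on send
		go process(i, chs[i])
	}
	return chs // receive from chs[i] to collect the partial sum of chunk i
}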
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -194,6 +193,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -334,46 +334,43 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC1] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return 
processChunkG2Jacobian[bucketg2JacExtendedC16] } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { nbChunks := computeNbChunks(c) @@ -390,9 +387,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessorG2(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG2(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -400,6 +399,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG2(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -460,6 +460,17 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// lastC returns the size, in bits, of the last window of the decomposition; +// we decompose n = fr.Bits+1 bits (one extra bit for the potential NAF carry), +// so it returns c when c divides n, and the remainder n mod c (always < c) otherwise +func lastC(c uint64) uint64 { + const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition + if n%c == 0 { + return c + } + return n - (c * (n / c)) +} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative. diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index f25c59a8b6..870483e934 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -49,64 +49,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -148,15 +114,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -165,19 +128,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -191,7 +141,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -213,7 +163,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -307,64 +256,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -406,15 +321,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -423,19 +335,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -449,7 +348,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -471,7 +370,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 4c40debed6..45bef3125a 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index be86f40a8f..28672b9219 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -128,47 +128,43 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points 
[]G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -185,9 +181,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
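// NOTE (editorial sketch, not part of the generated code): the splitFirstChunk path
// a few lines below halves chunk 0 across two goroutines when many scalars are small,
// since small scalars only populate the first window and would make chunk 0 the
// straggler. A minimal generic version of that split-and-fold (hypothetical names):

func splitChunk0[T any](n int, process func(lo, hi int, out chan<- T), fold func(a, b T) T) T {
	chLo, chHi := make(chan T, 1), make(chan T, 1)
	go process(0, n/2, chLo) // first half of the points/digits
	go process(n/2, n, chHi) // second half
	return fold(<-chLo, <-chHi) // the two half-sums add up to the chunk 0 sum
}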
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -195,6 +193,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -335,47 +334,43 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values 
is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { nbChunks := computeNbChunks(c) @@ -392,9 +387,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessorG2(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG2(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -402,6 +399,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG2(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -462,6 +460,17 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// lastC returns the size, in bits, of the last window of the decomposition; +// we decompose n = fr.Bits+1 bits (one extra bit for the potential NAF carry), +// so it returns c when c divides n, and the remainder n mod c (always < c) otherwise +func lastC(c uint64) uint64 { + const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition + if n%c == 0 { + return c + } + return n - (c * (n / c)) +} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if a digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative. diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 419acf811c..ca6e7c172a 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -49,64 +49,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -148,15 +114,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -165,19 +128,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -191,7 +141,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -213,7 +163,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -307,64 +256,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -406,15 +321,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -423,19 +335,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -449,7 +348,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -471,7 +370,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index d79044f69c..57956e9c5a 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 7272a5efef..cc02ad9b57 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -128,47 +128,43 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG1(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG1(p *G1Jac, c int, points 
[]G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG1(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC4] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC5] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC3] - _innerMsmG1(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - processChunk := processChunkG1Jacobian[bucketg1JacExtendedC8] - processLastChunk := processChunkG1Jacobian[bucketg1JacExtendedC2] - _innerMsmG1(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - processChunk := processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - processLastChunk := processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - _innerMsmG1(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16)) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { nbChunks := computeNbChunks(c) @@ -185,9 +181,11 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
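For intuition on that last-chunk remark: with fr.Bits = 377 (BW6-761's scalar field) and c = 16, the fr.Bits+1 = 378 digit bits split into 23 full 16-bit windows plus one 10-bit window, which is why the removed code paired the C16 processor with a C10 one. A self-contained sketch, assuming computeNbChunks rounds up over fr.Bits+1, and copying lastC as it is introduced further down this diff:

package main

import "fmt"

const frBits = 377 // e.g. BW6-761's scalar field size

// assumption: computeNbChunks rounds up over fr.Bits+1, consistent with lastC below
func computeNbChunks(c uint64) uint64 {
	const n = frBits + 1 // +1 for the potential carry of the NAF decomposition
	nbChunks := n / c
	if n%c != 0 {
		nbChunks++
	}
	return nbChunks
}

// copied from this patch
func lastC(c uint64) uint64 {
	const n = frBits + 1
	if n%c == 0 {
		return c
	}
	return n - (c * (n / c))
}

func main() {
	// c=16: 378 = 23*16 + 10, so 24 chunks and a 10-bit last window --
	// matching the C16/C10 processor pair the removed code wired by hand.
	fmt.Println(computeNbChunks(16), lastC(16)) // 24 10
}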
n := len(points) + processLastChunk := getChunkProcessorG1(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j > 0; j-- { + processChunk := getChunkProcessorG1(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -195,6 +193,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessorG1(c) if !splitFirstChunk { go processChunk(0, chChunks[0], c, points, digits[:n]) } else { @@ -335,47 +334,43 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - innerMsmG2(p, int(C), points, scalars, config) - - return p, nil -} - -func innerMsmG2(p *G2Jac, c int, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) { - // partition the scalars // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window // var smallValues int - digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks) + digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) // if we have more than 10% of small values, we split the processing of the first chunk in 2 // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + + _innerMsmG2(p, C, points, digits, splitFirstChunk) + + return p, nil +} + +func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + mustBeExt := false switch c { case 4: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC4] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 4, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC5] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC3] - _innerMsmG2(p, 5, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - processChunk := processChunkG2Jacobian[bucketg2JacExtendedC8] - processLastChunk := processChunkG2Jacobian[bucketg2JacExtendedC2] - _innerMsmG2(p, 8, points, digits, splitFirstChunk, processChunk, processLastChunk) + return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - processChunk := processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - processLastChunk := processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - _innerMsmG2(p, 16, points, digits, splitFirstChunk, processChunk, processLastChunk) + if mustBeExt { + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: - panic("not implemented") + // panic("will not happen c != previous values 
is not generated by templates")
+		return processChunkG2Jacobian[bucketg2JacExtendedC16]
 	}
 }
 
-func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool,
-	processChunk, processLastChunk func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16)) *G2Jac {
+func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac {
 
 	nbChunks := computeNbChunks(c)
 
@@ -392,9 +387,11 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
+	processLastChunk := getChunkProcessorG2(lastC(c))
 	go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:])
 
 	for j := int(nbChunks - 2); j > 0; j-- {
+		processChunk := getChunkProcessorG2(c)
 		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
 	}
 
@@ -402,6 +399,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi
 	// --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed
 	// in the ~same amount of time
 	if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen.
+		processChunk := getChunkProcessorG2(c)
 		if !splitFirstChunk {
 			go processChunk(0, chChunks[0], c, points, digits[:n])
 		} else {
@@ -462,6 +460,17 @@ func computeNbChunks(c uint64) uint64 {
 	return nbChunks
 }
 
+// lastC returns the size of the last window for a scalar: if c divides the
+// scalar size (fr.Bits+1), it returns c; otherwise it returns the remainder
+// (fr.Bits+1) mod c, which is strictly less than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
 // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
 // 2^{c} to the current digit, making it negative.
diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go
index 4f039c26bc..3b653b6563 100644
--- a/ecc/bw6-761/multiexp_affine.go
+++ b/ecc/bw6-761/multiexp_affine.go
@@ -49,64 +49,30 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af
 	}
 
 	// setup for the batch affine;
-	// we do that instead of a separate object to give enough hints to the compiler to..
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG1Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G1Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -148,15 +114,12 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -165,19 +128,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -191,7 +141,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -213,7 +163,6 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } @@ -307,64 +256,30 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func() { - if (cptAdd) == 0 { - return - } batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOpG2Affine) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } - add := func(bucketID uint16, PP *G2Affine, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -406,15 +321,12 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } - var queue TQ - qID := 0 - processQueue := func() { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -423,19 +335,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } - for i, digit := range digits { if digit == 0 || points[i].IsInfinity() { @@ -449,7 +348,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketID -= 1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -471,7 +370,6 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 2dcc22a913..968613803a 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -21,7 +21,6 @@ import ( "math/big" "math/bits" "math/rand" - "runtime" "sync" "testing" "time" @@ -91,8 +90,7 @@ func TestMultiExpG1(t *testing.T) { FromMont() } - innerMsmG1(&r16, 16, samplePointsLarge[:], sampleScalars[:], 
ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -127,8 +125,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -163,8 +162,9 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i, c := range cRange { - innerMsmG1(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -231,7 +231,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -373,8 +373,7 @@ func TestMultiExpG2(t *testing.T) { FromMont() } - innerMsmG2(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) - + r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{}) splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128}) splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51}) return r16.Equal(&splitted1) && r16.Equal(&splitted2) @@ -407,8 +406,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test of all C + results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -443,8 +443,9 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i, c := range cRange { - innerMsmG2(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + for i := range cRange { + // TODO @gbotrel restore test for all C + results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -511,7 +512,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 5; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 7066a005af..ef2b67b636 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -43,6 +43,18 @@ func computeNbChunks(c uint64) uint64 { return nbChunks } +// return the last window size for a 
scalar; if c divides the scalar size (fr.Bits+1)
+// then it returns c
+// otherwise, returns the remainder (fr.Bits+1) mod c, which is strictly less than c
+func lastC(c uint64) uint64 {
+	const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition
+	if n%c == 0 {
+		return c
+	}
+	return n - (c * (n / c))
+}
+
+
 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
 // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract
 // 2^{c} to the current digit, making it negative.
@@ -399,53 +411,43 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 	}
 	}
 
-	innerMsm{{ $.UPointName }}(p, int(C), points, scalars, config)
-
-	return p, nil
-}
-
-
-func innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c int, points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) {
-
 	// partition the scalars
 	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
 	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
 	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, uint64(c), config.ScalarsMont, config.NbTasks)
+	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
 	// if we have more than 10% of small values, we split the processing of the first chunk in 2
 	// we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time
 	splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1
 
-	{{- /* TODO @gbotrel need to deal with cases where lastC == 1 ; having a whole chunk with 1-bit window makes no sense */}}
-	{{- /* also need to determine until which window size the ext-jacobian version is worth it. */}}
+	_innerMsm{{ $.UPointName }}(p, C, points, digits, splitFirstChunk)
+
+	return p, nil
+}
+
+
+func getChunkProcessor{{ $.UPointName }}(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) {
+	mustBeExt := false
 	switch c {
-		{{range $c :=  $.CRange}}
-		{{- $lc := lastC $c}}
-		case {{$c}}:
-			{{- if le $c 9}}
-			processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
-			{{- else}}
-			processChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}]
-			{{- end}}
-			{{- if eq $c $lc}}
-			_innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk, processChunk, processChunk)
-			{{- else}}
-				{{- if le $lc 9}}
-				processLastChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{lastC $c}}]
+	{{range $c :=  $.CRange}}
+	case {{$c}}:
+		{{- if le $c 9}}
+		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
 		{{- else}}
-				processLastChunk := processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{lastC $c}}, bitSetC{{lastC $c}}, p{{$.TAffine}}C{{lastC $c}}, pp{{$.TAffine}}C{{lastC $c}}, q{{$.TAffine}}C{{lastC $c}}, c{{$.TAffine}}C{{lastC $c}}]
+		if mustBeExt {
+			return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
+		}
+		return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}]
 		{{- end}}
-		_innerMsm{{ $.UPointName }}(p, {{$c}}, points, digits, splitFirstChunk,
processChunk, processLastChunk) {{- end}} - {{- end}} - default: - panic("not implemented") + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C16] } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool, - processChunk, processLastChunk func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16)) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool) *{{ $.TJacobian }} { nbChunks := computeNbChunks(c) @@ -462,9 +464,11 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) + processLastChunk := getChunkProcessor{{ $.UPointName }}(lastC(c)) go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) for j := int(nbChunks - 2); j >0; j-- { + processChunk := getChunkProcessor{{ $.UPointName }}(c) go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } @@ -472,6 +476,7 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed // in the ~same amount of time if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + processChunk := getChunkProcessor{{ $.UPointName }}(c) if !splitFirstChunk { go processChunk(0,chChunks[0], c, points, digits[:n]) } else { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 312ea0897f..8462321866 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -55,67 +55,32 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T } // setup for the batch affine; - // we do that instead of a separate object to give enough hints to the compiler to.. 
- var bucketIds BS // bitSet to signify presence of a bucket in current batch - cptAdd := 0 // count the number of bucket + point added to current batch - - - var R TPP // bucket references - var P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + var ( + bucketIds BS // bitSet to signify presence of a bucket in current batch + cptAdd int // count the number of bucket + point added to current batch + R TPP // bucket references + P TP // points to be added to R (buckets); it is beneficial to store them on the stack (ie copy) + queue TQ // queue of points that conflict the current batch + qID int // current position in queue + ) batchSize := len(P) - canAdd := func(bID uint16) bool { - return !bucketIds[bID] - } - isFull := func() bool { - return (cptAdd) == batchSize + return cptAdd == batchSize } executeAndReset := func () { - if (cptAdd) == 0 { - return - } batchAdd{{ $.TAffine }}[TP, TPP, TC](&R, &P, cptAdd) - var tmp BS bucketIds = tmp cptAdd = 0 } - addFromQueue := func(op batchOp{{$.TAffine}}) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - - BK := &buckets[op.bucketID] - // handle special cases with inf or -P / P - if BK.IsInfinity() { - BK.Set(&op.point) - return - } - if BK.X.Equal(&op.point.X) { - if BK.Y.Equal(&op.point.Y) { - // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) - return - } - BK.setInfinity() - return - } - - - bucketIds[op.bucketID] = true - R[cptAdd] = BK - P[cptAdd] = op.point - cptAdd++ - } add := func(bucketID uint16, PP *{{$.TAffine}}, isAdd bool) { - // CanAdd must be called before --> ensures bucket is not "used" in current batch - + // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P if BK.IsInfinity() { @@ -147,7 +112,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T return } - bucketIds[bucketID] = true R[cptAdd] = BK if isAdd { @@ -157,17 +121,14 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T } cptAdd++ } - - var queue TQ - qID := 0 processQueue := func () { for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { + if bucketIds[queue[i].bucketID] { continue } - addFromQueue(queue[i]) + add(queue[i].bucketID, &queue[i].point, true) if isFull() { executeAndReset() } @@ -176,18 +137,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T } } - processTopQueue := func() { - for i := qID - 1; i >= 0; i-- { - if !canAdd(queue[i].bucketID) { - return - } - addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - qID-- - } - } for i, digit := range digits { @@ -202,7 +151,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T bucketID-=1 } - if !canAdd(bucketID) { + if bucketIds[bucketID] { // put it in queue queue[qID].bucketID = bucketID if isAdd { @@ -224,7 +173,6 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processTopQueue() } } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 070481bf7b..455142ceb6 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl 
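A note on the batch-affine processors above: batchAdd* computes every lambda denominator of a batch with one shared field inversion, and all additions read their bucket's coordinates before any result is written back, so a bucket may appear at most once per batch. That is what the bucketIds bitset enforces; a second point for the same bucket waits in queue until executeAndReset clears it. The shared-inversion trick itself (Montgomery batch inversion, as in this PR's BatchInvert* helpers) works as below; a toy sketch over math/big rather than fp.Element or fptower.E2:

package main

import (
	"fmt"
	"math/big"
)

// batchInvert computes n modular inverses for the cost of 1 inversion
// plus 3(n-1) multiplications, via prefix products
func batchInvert(a []*big.Int, p *big.Int) []*big.Int {
	res := make([]*big.Int, len(a))
	acc := big.NewInt(1)
	for i := range a {
		res[i] = new(big.Int).Set(acc) // res[i] = a[0]*...*a[i-1]
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	acc.ModInverse(acc, p) // the single field inversion
	for i := len(a) - 1; i >= 0; i-- {
		res[i].Mul(res[i], acc)
		res[i].Mod(res[i], p) // now res[i] = a[i]^-1
		acc.Mul(acc, a[i])
		acc.Mod(acc, p)
	}
	return res
}

func main() {
	p := big.NewInt(101)
	inv := batchInvert([]*big.Int{big.NewInt(3), big.NewInt(7), big.NewInt(10)}, p)
	fmt.Println(inv) // [34 29 91]: 3*34, 7*29, 10*91 are all = 1 (mod 101)
}

Trading n inversions for one inversion and a handful of multiplications is what makes affine buckets competitive with the extended-Jacobian ones once the batch is large enough.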
@@ -10,7 +10,6 @@ import (
 	"fmt"
 	"time"
-	"runtime"
 	"math/rand"
 	"math/big"
 	"testing"
@@ -90,8 +89,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 				FromMont()
 			}
 
-			innerMsm{{ toUpper $.PointName }}(&r16, 16, samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()})
-
+			r16.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{})
 			splitted1.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 128})
 			splitted2.MultiExp(samplePointsLarge[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: 51})
 			return r16.Equal(&splitted1) && r16.Equal(&splitted2)
@@ -135,8 +133,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 
 			results := make([]{{ $.TJacobian }}, len(cRange))
-			for i, c := range cRange {
-				innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()})
+			for i := range cRange {
+				// TODO @gbotrel restore test of all C
+				results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{})
 			}
 			for i:=1; i < len(results);i++ {
 				if !results[i].Equal(&results[i-1]) {
@@ -171,8 +170,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 			}
 
 			results := make([]{{ $.TJacobian }}, len(cRange))
-			for i, c := range cRange {
-				innerMsm{{ toUpper $.PointName }}(&results[i], int(c), samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks:runtime.NumCPU()})
+			for i := range cRange {
+				// TODO @gbotrel restore test for all C
+				results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{})
 			}
 			for i := 1; i < len(results); i++ {
 				if !results[i].Equal(&results[i-1]) {
@@ -245,7 +245,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) {
 
 	var testPoint {{ $.TAffine }}
 
-	for i := 5; i <= pow; i++ {
+	for i := 15; i <= pow; i++ {
 		using := 1 << i
 
 		b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) {

From 555ca0d4e0580b290e9e06325bcb339a86fe8bbb Mon Sep 17 00:00:00 2001
From: Gautam Botrel
Date: Tue, 15 Nov 2022 11:40:57 -0600
Subject: [PATCH 24/43] feat: added chunkStats instead of small values

---
 ecc/bls12-377/multiexp.go                     | 158 ++++++++----------
 ecc/bls12-378/multiexp.go                     | 158 ++++++++----------
 ecc/bls12-381/multiexp.go                     | 158 ++++++++----------
 ecc/bls24-315/multiexp.go                     | 158 ++++++++----------
 ecc/bls24-317/multiexp.go                     | 158 ++++++++----------
 ecc/bn254/multiexp.go                         | 158 ++++++++----------
 ecc/bw6-633/multiexp.go                       | 158 ++++++++----------
 ecc/bw6-756/multiexp.go                       | 158 ++++++++----------
 ecc/bw6-761/multiexp.go                       | 158 ++++++++----------
 .../generator/ecc/template/multiexp.go.tmpl   | 115 ++++++-------
 10 files changed, 645 insertions(+), 892 deletions(-)

diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go
index cf970c5246..b04f523cff 100644
--- a/ecc/bls12-377/multiexp.go
+++ b/ecc/bls12-377/multiexp.go
@@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul
 	}
 
 	// partition the scalars
-	// note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW)
-	// if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window
-	// var smallValues int
-	digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
+	digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks)
 
-	// if we have more than 10% of small values, we split the processing of the first chunk in 2
-	// we may want to do that in innerMsm, but that would incur a cost
of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
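A worked example of the recoding described in the comments above, for c = 4: the window value 13 exceeds 2^{c-1} = 8, so it becomes the digit 13 - 16 = -3 with a carry of 1 into the next window, and the non-zero digit is then packed into a uint16 with the sign in the low bit. A minimal sketch; the borrow condition is written as >= max here to match the "larger than 2^{c-1}" comment, while the exact boundary handling lives in partitionScalars:

package main

import "fmt"

// recode one c-bit window into a signed digit plus a carry
// for the next window, as the partitionScalars comments describe
func recode(window, c int) (digit, carry int) {
	max := 1 << (c - 1)
	if window >= max {
		return window - (1 << c), 1 // borrow 2^c from the next window
	}
	return window, 0
}

// pack a non-zero signed digit into a uint16, sign in the low bit
// (even = add the point, odd = subtract it), as in this patch
func pack(digit int) uint16 {
	if digit > 0 {
		return uint16(digit) << 1
	}
	return (uint16(-digit-1) << 1) + 1
}

func main() {
	d, carry := recode(13, 4)
	fmt.Println(d, carry, pack(d)) // -3 1 5: subtract from the bucket holding 3*P
}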
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 91f7ede410..cd49deb3de 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the 
rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
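The msmReduceChunk* call above then folds the per-chunk partial sums S_j into sum_j 2^(c*j) * S_j in the usual bucket-method fashion, Horner style: start from the most significant chunk and, for each step down, double c times and add the next partial sum. A toy sketch with integers standing in for curve points (doubling an int plays the role of doubling a point):

package main

import (
	"fmt"
	"math/big"
)

func main() {
	c := 4
	chunks := []int64{3, 0, 5} // partial sums S_0, S_1, S_2, least significant first
	acc := big.NewInt(chunks[len(chunks)-1])
	for j := len(chunks) - 2; j >= 0; j-- {
		for k := 0; k < c; k++ {
			acc.Add(acc, acc) // "double" c times between chunks
		}
		acc.Add(acc, big.NewInt(chunks[j])) // add the next partial sum
	}
	fmt.Println(acc) // 5*2^8 + 0*2^4 + 3 = 1283
}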
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
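To see what the weight metric replacing the old smallValues heuristic does: partitionScalars now counts the non-zero digits per chunk, and weight expresses each chunk's count as a percentage of the per-chunk average, so the weight >= 150.0 test in _innerMsm* fires when a chunk carries at least 1.5x the average work. Toy numbers mimicking the SNARK-witness skew the removed comments mention, where small scalars populate only the first chunk:

package main

import "fmt"

func main() {
	opsPerChunk := []int{900, 40, 35, 25} // non-zero digits per chunk
	totalOps := 0
	for _, o := range opsPerChunk {
		totalOps += o
	}
	// same formula as this patch: weight = 100 * ops / (totalOps / nbChunks)
	target := float32(totalOps) / float32(len(opsPerChunk)) // 250
	for i, ops := range opsPerChunk {
		weight := float32(ops) * 100.0 / target
		fmt.Printf("chunk %d: weight=%.0f split=%v\n", i, weight, weight >= 150.0)
	}
	// chunk 0: weight=360 split=true; the other chunks stay on one goroutine
}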
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 191875391e..4d7a2e07f5 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the 
rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
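The hunk that follows replaces the small-value counting with a full signed-digit decomposition. As a single-word sketch of the arithmetic (the real code below works on multi-limb fr.Element values via the precomputed selectors; decomposeSigned is an illustrative name, not the library's API):

// decomposeSigned splits a scalar into signed c-bit digits, least
// significant first. A window larger than 2^(c-1) borrows 2^c from the
// next window, so every digit ends up in [-(2^(c-1)-1), 2^(c-1)].
// A zero scalar yields no digits, matching the IsZero early exit below.
func decomposeSigned(scalar uint64, c uint) []int64 {
	mask := uint64(1<<c) - 1
	max := int64(1 << (c - 1))
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		d := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if d > max {
			d -= 1 << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, d)
	}
	return digits
}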
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 894b89a73d..3f49fd1afd 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the 
rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
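The split branch below follows a standard fan-out/fan-in shape: two producers each handle half the points and a merger forwards a single combined value, so the consumer still receives exactly one result per chunk. A generic sketch of just that pattern, assuming Go 1.18 generics for brevity (runSplit and its parameters are illustrative, not part of gnark-crypto; the real code uses g2JacExtended and processChunk directly):

// Two producers each send one partial result; the merger combines them
// and forwards a single value on the chunk's output channel.
func runSplit[T any](out chan<- T, merge func(T, T) T, left, right func(chan<- T)) {
	chSplit := make(chan T, 2)
	go left(chSplit)
	go right(chSplit)
	go func() {
		s1 := <-chSplit
		s2 := <-chSplit
		close(chSplit)
		out <- merge(s1, s2)
	}()
}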
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
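The selectors built above record, per chunk, which limb(s), mask, and shifts isolate that chunk's c bits; when c does not divide 64 a window can straddle two 64-bit words. A hypothetical flattened version of that lookup, assuming c <= 64 and little-endian limbs (extractWindow is a sketch of the bit arithmetic, not the library's API):

// extractWindow reads c bits starting at bit offset chunk*c, pulling
// the high part from the next limb when the window straddles a word
// boundary.
func extractWindow(limbs []uint64, chunk, c uint64) uint64 {
	bit := chunk * c
	word, shift := bit/64, bit%64
	mask := uint64(1<<c) - 1
	w := limbs[word] >> shift
	if shift+c > 64 && word+1 < uint64(len(limbs)) {
		w |= limbs[word+1] << (64 - shift) // bits spilling into the next word
	}
	return w & mask
}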
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 7cd831c238..29548cc89c 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the 
rest, as it could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
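Each processChunk goroutine launched below consumes the encoded digits by bucketing points. To show only the effect of the encoding, here is a toy model over the integers, where the group law becomes ordinary addition (the real code adds curve points into g2JacExtended buckets, and gnark-crypto's exact bucket indexing may differ from this sketch):

// With digits in [-(2^(c-1)-1), 2^(c-1)], only 2^(c-1) buckets are
// needed: the low bit of the encoding carries the sign, so a negative
// digit subtracts the point instead of adding it.
func accumulate(buckets []int64, points []int64, digits []uint16) {
	for i, bits := range digits {
		if bits == 0 {
			continue // digit 0 was never written: no contribution
		}
		if bits&1 == 0 {
			buckets[(bits>>1)-1] += points[i] // digit = bits>>1
		} else {
			buckets[bits>>1] -= points[i] // digit = -(bits>>1) - 1
		}
	}
	// the chunk's partial sum is then the sum over j of (j+1)*buckets[j]
}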
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
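To make the decomposition below concrete: with c = 4, the scalar 181 = 0b10110101 splits into windows 5 and 11, least significant first. Since 11 > 2^(c-1) = 8, it is replaced by 11 - 16 = -5 with a carry of 1 into a third window, and indeed 5 - 5*16 + 1*256 = 181. The stored encodings are then 5<<1 = 10, (4<<1)+1 = 9 (for -5, since -(-5)-1 = 4), and 1<<1 = 2.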
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 9b3140de09..87cee65e19 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -200,7 +193,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -217,35 +210,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could 
be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -371,16 +358,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -442,7 +422,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -459,35 +439,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
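After the dispatch loop below, msmReduceChunkG2Affine folds the per-chunk partial sums into the final point. Its implementation sits outside this hunk; as a toy model over the integers, the fold is Horner evaluation in base 2^c, where each left shift stands in for c point doublings:

// Chunk j is weighted by 2^(c*j), so folding from the top chunk down
// needs only c doublings per step. (Integer sketch; the real reduce
// consumes the chunk channels in order and doubles curve points.)
func reduce(c uint, chunkSums []int64) int64 {
	var acc int64
	for j := len(chunkSums) - 1; j >= 0; j-- {
		acc <<= c // c point doublings in the real code
		acc += chunkSums[j]
	}
	return acc
}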
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -543,24 +517,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -579,36 +554,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
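The encoding written below packs a signed digit into a uint16: the low bit is the sign, the upper bits the sign-adjusted magnitude, and 0 is reserved for a zero digit. A hypothetical inverse, to make the round trip explicit (decodeDigit is illustrative; the chunk processors consume the encoding directly):

func decodeDigit(bits uint16) int {
	if bits == 0 {
		return 0
	}
	if bits&1 == 0 {
		return int(bits >> 1) // positive digit: bits = digit<<1
	}
	return -int(bits>>1) - 1 // negative digit: bits = ((-digit-1)<<1)+1
}

Round trip: a positive digit d gives bits = d<<1, decoded back to d; a negative d gives bits = ((-d-1)<<1)+1, decoded to -(bits>>1)-1 = d.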
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -637,26 +598,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 71170c6cf9..d77d85346f 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -164,7 +157,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -181,35 +174,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it 
could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -335,16 +322,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -370,7 +350,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -387,35 +367,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
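One consequence of folding the last chunk into the main loop below: its processor is still built from lastC(c) rather than c, because when the total bit width n being decomposed is not a multiple of c, the top window only carries n - c*(n/c) meaningful bits, i.e. n mod c. Assuming, for illustration, n = 254 with c = 16, the first fifteen chunks take 16 bits each and the last takes 14; n's exact definition, and the handling when c divides n, sit outside this hunk.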
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -471,24 +445,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates wheter the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -507,36 +482,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK. 
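The aggregation introduced by the hunk below can be read standalone: each parallel.Execute worker sends one []int of per-chunk digit counts, and the receiver turns the totals into percentages of the average load. A self-contained sketch (aggregateWeights is an illustrative name; it assumes every worker has already sent, so the buffered channel can be closed and drained, and that at least one scalar is non-zero so target is non-zero):

func aggregateWeights(ch chan []int, nbChunks int) []float32 {
	close(ch) // all workers have sent; draining a closed buffered channel is safe
	ops := make([]int, nbChunks)
	total := 0
	for per := range ch {
		for i, n := range per {
			ops[i] += n
			total += n
		}
	}
	target := float32(total) / float32(nbChunks)
	weights := make([]float32, nbChunks)
	for i := range weights {
		weights[i] = float32(ops[i]) * 100.0 / target // 100 == average load
	}
	return weights
}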
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and an eventual carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -565,26 +526,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // what percentage are you of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 28672b9219..ae313e1d52 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -164,7 +157,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -181,35 +174,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it 
could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -335,16 +322,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -370,7 +350,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -387,35 +367,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
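Taken together, the restructuring below means every chunk, including the last, flows through one uniform loop: each chunk owns a channel that receives exactly one g2JacExtended value, produced either by a single goroutine or, for chunks at or above 150% of the average weight, by two producers plus a merger. Compared with the old splitFirstChunk flag, which could only rebalance chunk 0, the per-chunk weights let any skewed window trigger a split, at the cost of the extra bucket allocations for the second producer.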
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -471,24 +445,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates whether the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -507,36 +482,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK.
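The parallel loop below performs this signed decomposition limb by limb. As a self-contained sketch of the same idea on a plain uint64 (signedDigits and encode are illustrative names; the real code selects each window across the 64-bit limbs of an fr.Element, possibly straddling two limbs):

package main

import "fmt"

// signedDigits decomposes scalar into c-bit signed digits in [-2^(c-1), 2^(c-1)]:
// any window strictly larger than 2^(c-1) borrows 2^c from the next window.
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	max := int64(1) << (c - 1)
	var digits []int64
	carry := int64(0)
	for scalar != 0 || carry != 0 {
		digit := int64(scalar&mask) + carry
		scalar >>= c
		carry = 0
		if digit > max {
			digit -= int64(1) << c // borrow 2^c from the next window
			carry = 1
		}
		digits = append(digits, digit)
	}
	return digits
}

// encode matches the packing in the diff: zero stays zero, positive digits
// are shifted left, and negative digits fold the sign into the low bit.
func encode(digit int64) uint16 {
	if digit == 0 {
		return 0
	}
	if digit > 0 {
		return uint16(digit) << 1
	}
	return (uint16(-digit-1) << 1) + 1
}

func main() {
	// 222 = 0xDE: the c=4 windows (LSW first) are 14, 13.
	// 14 > 8 borrows: digit -2, carry 1; 13+1 = 14 borrows again: digit -2, carry 1;
	// the final carry yields digit 1, so 222 = -2 - 2*16 + 1*256.
	for _, d := range signedDigits(222, 4) {
		fmt.Println(d, encode(d))
	}
}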
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and a possible carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -565,26 +526,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // express each chunk's op count as a percentage of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index cc02ad9b57..1c7ae6f672 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -129,16 +129,9 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsmG1(p, C, points, digits, splitFirstChunk) + _innerMsmG1(p, C, points, digits, chunkStats) return p, nil } @@ -164,7 +157,7 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFirstChunk bool) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { nbChunks := computeNbChunks(c) @@ -181,35 +174,29 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it
could be smaller. n := len(points) - processLastChunk := getChunkProcessorG1(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG1(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG1(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG1Affine(p, int(c), chChunks[:]) @@ -335,16 +322,9 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, splitFirstChunk) + _innerMsmG2(p, C, points, digits, chunkStats) return p, nil } @@ -370,7 +350,7 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFirstChunk bool) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { nbChunks := computeNbChunks(c) @@ -387,35 +367,29 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, splitFi // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - processLastChunk := getChunkProcessorG2(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - for j := int(nbChunks - 2); j > 0; j-- { + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessorG2(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. - processChunk := getChunkProcessorG2(c) - if !splitFirstChunk { - go processChunk(0, chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 - go processChunk(0, chSplit, c, points[:split], digits[:split]) - go processChunk(0, chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() + continue } - + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } return msmReduceChunkG2Affine(p, int(c), chChunks[:]) @@ -471,24 +445,25 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks. +} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract // 2^{c} from the current digit, making it negative.
// negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates whether the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window - max := int(1 << (c - 1)) // max value we want for our digits - cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words + max := int(1 << (c - 1)) // max value we want for our digits + cDivides64 := (64 % c) == 0 // if c doesn't divide 64, we may need to select over multiple words // compute offset and word selector / shift to select the right bits of our windows selectors := make([]selector, nbChunks) @@ -507,36 +482,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK.
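Alongside the digits, the loop below also counts how many non-zero digits (i.e. bucket operations) each chunk will process; the aggregation at its end converts those counts into the chunkStat weights consumed by _innerMsm. In isolation, the computation looks like this (a hedged sketch; chunkWeights is an illustrative name):

package main

import "fmt"

// chunkWeights mirrors the aggregation in the diff: each chunk's op count is
// expressed as a percentage of the ideal share totalOps/nbChunks, so 100
// means perfectly balanced and anything >= 150 triggers a split in _innerMsm.
func chunkWeights(opsPerChunk []int) []float32 {
	totalOps := 0
	for _, n := range opsPerChunk {
		totalOps += n
	}
	target := float32(totalOps) / float32(len(opsPerChunk))
	weights := make([]float32, len(opsPerChunk))
	for i, n := range opsPerChunk {
		weights[i] = float32(n) * 100.0 / target
	}
	return weights
}

func main() {
	// SNARK-style witnesses: many scalars are 0 or 1, so chunk 0 sees far
	// more non-zero digits than the higher windows.
	fmt.Println(chunkWeights([]int{900, 300, 300})) // [180 60 60]
}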
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i := start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and a possible carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -565,26 +526,39 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues += o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i] += nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // express each chunk's op count as a percentage of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index ef2b67b636..4aa916fcfc 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -54,6 +54,10 @@ func lastC(c uint64) uint64 { return n - (c * (n / c)) } +type chunkStat struct { + weight float32 // relative weight compared to other chunks.
+} + // partitionScalars computes, for each scalar over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract @@ -61,16 +65,13 @@ func lastC(c uint64) uint64 { // negative digits can be processed in a later step as adding -G into the bucket instead of G // (computing -G is cheap, and this saves us half of the buckets in the MultiExp or BatchScalarMultiplication) // scalarsMont indicates whether the provided scalars are in montgomery form -// returns smallValues, which represent the number of scalars which meets the following condition -// 0 < scalar < 2^c (in other words, scalars where only the c-least significant bits are non zero) -func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, int) { +func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks int) ([]uint16, []chunkStat) { // number of c-bit radixes in a scalar nbChunks := computeNbChunks(c) - toReturn := make([]uint16, len(scalars)*int(nbChunks)) + digits := make([]uint16, len(scalars)*int(nbChunks)) mask := uint64((1 << c) - 1) // low c bits are 1 - // msbWindow := uint64(1 << (c -1)) // msb of the c-bit window max := int(1 << (c -1)) // max value we want for our digits cDivides64 := (64 %c ) == 0 // if c doesn't divide 64, we may need to select over multiple words @@ -92,36 +93,22 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - // for each chunk, we could track the number of non-zeros points we will need to process - // this way, if a chunk has more work to do than others, we can spawn off more go routines - // (at the cost of more buckets allocated) - // a simplified approach is to track the small values where only the first word is set - // if this number represent a significant number of points, then we will split first chunk - // processing in the msm in 2, to ensure all go routines finish at ~same time - // /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks go routine - // if it does, though, this will deadlocK.
- chSmallValues := make(chan int, nbTasks) + chOpsPerChunk := make(chan []int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - smallValues := 0 + opsPerChunk := make([]int, nbChunks) for i:=start; i < end; i++ { - var carry int - scalar := scalars[i] if scalarsMont { scalar.FromMont() } - if scalar.FitsOnOneWord() { + if scalar.IsZero() { // everything is 0, no need to process this scalar - if scalar[0] == 0 { - continue - } - // low c-bits are 1 in mask - if scalar[0]&mask == scalar[0] { - smallValues++ - } + continue } + var carry int + // for each chunk in the scalar, compute the current digit, and a possible carry for chunk := uint64(0); chunk < nbChunks; chunk++ { s := selectors[chunk] @@ -151,27 +138,40 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // if digit is zero, no impact on result if digit == 0 { continue - } else if digit > 0 { + } + if digit > 0 { bits = uint16(digit) << 1 } else { bits = (uint16(-digit-1) << 1) + 1 } - toReturn[int(chunk)*len(scalars)+i] = bits + digits[int(chunk)*len(scalars)+i] = bits + opsPerChunk[chunk]++ } } - chSmallValues <- smallValues + chOpsPerChunk <- opsPerChunk }, nbTasks) - // aggregate small values - close(chSmallValues) - smallValues := 0 - for o := range chSmallValues { - smallValues+=o + // aggregate chunk stats + close(chOpsPerChunk) + opsPerChunk := make([]int, nbChunks) + totalOps := 0 + for o := range chOpsPerChunk { + for i, nbOps := range o { + opsPerChunk[i]+=nbOps + totalOps += nbOps + } } - return toReturn, smallValues + chunkStats := make([]chunkStat, nbChunks) + target := float32(totalOps) / float32(nbChunks) + // express each chunk's op count as a percentage of the target + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + } + + return digits, chunkStats } @@ -412,16 +412,9 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } // partition the scalars - // note: we do that before the actual chunk processing, as for each c-bit window (starting from LSW) - // if it's larger than 2^{c-1}, we have a carry we need to propagate up to the higher window - // var smallValues int - digits, smallValues := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - // if we have more than 10% of small values, we split the processing of the first chunk in 2 - // we may want to do that in innerMsm, but that would incur a cost of looping through all scalars one more time - splitFirstChunk := (float64(smallValues) / float64(len(scalars))) >= 0.1 - - _innerMsm{{ $.UPointName }}(p, C, points, digits, splitFirstChunk) + _innerMsm{{ $.UPointName }}(p, C, points, digits, chunkStats) return p, nil } @@ -447,7 +440,7 @@ func getChunkProcessor{{ $.UPointName }}(c uint64 /* some other params to determ } } -func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, splitFirstChunk bool) *{{ $.TJacobian }} { +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, chunkStats []chunkStat) *{{ $.TJacobian }} { nbChunks := computeNbChunks(c) @@ -464,37 +457,31 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller.
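A few hunks below, the next commit (PATCH 25/43) adds a processQueue() call after each executeAndReset() in the batch-affine chunk processors. The batch-affine scheduling itself lives in multiexp_affine.go, which this excerpt only shows in part, so the model below is a heavily hedged toy (integer "points", map-based bookkeeping, invented names op/runChunk): adds that conflict with a bucket already scheduled in the current batch are parked in a queue, and the new call drains that queue each time the batch is flushed instead of letting it grow until the end of the chunk.

package main

import "fmt"

type op struct {
	bucketID int
	point    int // stand-in for a curve point
}

// runChunk sketches the scheduling around the added processQueue() call.
func runChunk(ops []op, nbBuckets, batchSize int) []int {
	buckets := make([]int, nbBuckets)
	inBatch := make(map[int]bool) // buckets touched by the current batch
	batch := make([]op, 0, batchSize)
	var queue []op

	executeAndReset := func() {
		for _, o := range batch { // stands in for one batched affine addition
			buckets[o.bucketID] += o.point
		}
		batch = batch[:0]
		inBatch = make(map[int]bool)
	}
	add := func(o op) bool {
		if inBatch[o.bucketID] {
			return false // conflict: a bucket may appear only once per batch
		}
		inBatch[o.bucketID] = true
		batch = append(batch, o)
		return true
	}
	processQueue := func() {
		for i := 0; i < len(queue); {
			if add(queue[i]) {
				queue = append(queue[:i], queue[i+1:]...)
			} else {
				i++
			}
		}
	}

	for _, o := range ops {
		if !add(o) {
			queue = append(queue, o)
		}
		if len(batch) == batchSize {
			executeAndReset()
			processQueue() // the call added by this commit
		}
	}
	// drain the leftovers: after each flush the batch is empty, so at least
	// one queued op is admitted per round and the loop terminates.
	for executeAndReset(); len(queue) > 0; executeAndReset() {
		processQueue()
	}
	return buckets
}

func main() {
	ops := []op{{0, 1}, {0, 1}, {1, 1}, {0, 1}, {2, 1}}
	fmt.Println(runChunk(ops, 3, 2)) // [3 1 1]
}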
n := len(points) - processLastChunk := getChunkProcessor{{ $.UPointName }}(lastC(c)) - go processLastChunk(uint64(nbChunks-1), chChunks[nbChunks-1], c, points, digits[int(nbChunks-1)*n:]) - - for j := int(nbChunks - 2); j >0; j-- { - processChunk := getChunkProcessor{{ $.UPointName }}(c) - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - // the first chunk may be imbalanced with the others, in particular for SNARK witness values [0,1] - // --> if that is the case, we launch 2 go routines to process the chunk to ensure all chunk are processed - // in the ~same amount of time - if nbChunks > 1 { // sanity check since we processed the "last chunk" up there, shouldn't happen. + for j := int(nbChunks - 1); j >= 0; j-- { processChunk := getChunkProcessor{{ $.UPointName }}(c) - if !splitFirstChunk { - go processChunk(0,chChunks[0], c, points, digits[:n]) - } else { + if j == int(nbChunks - 1) { + processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c)) + } + if chunkStats[j].weight >= 150.0 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. chSplit := make(chan {{ $.TJacobianExtended }}, 2) split := n / 2 - go processChunk(0,chSplit, c, points[:split], digits[:split]) - go processChunk(0,chSplit, c, points[split:], digits[split:n]) + go processChunk(uint64(j),chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j),chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) go func() { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[0] <- s1 + chChunks[j] <- s1 }() - } - + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) } - + return msmReduceChunk{{ $.TAffine }}(p, int(c), chChunks[:]) } From 989a932fbe139eabb32b4264f98bbcb325d23dbb Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 11:57:01 -0600 Subject: [PATCH 25/43] test: added msm benchmarks with small values and redundancy --- ecc/bls12-377/multiexp_affine.go | 2 + ecc/bls12-377/multiexp_test.go | 76 ++++++++++++++++++- ecc/bls12-378/multiexp_affine.go | 2 + ecc/bls12-378/multiexp_test.go | 76 ++++++++++++++++++- ecc/bls12-381/multiexp_affine.go | 2 + ecc/bls12-381/multiexp_test.go | 76 ++++++++++++++++++- ecc/bls24-315/multiexp_affine.go | 2 + ecc/bls24-315/multiexp_test.go | 76 ++++++++++++++++++- ecc/bls24-317/multiexp_affine.go | 2 + ecc/bls24-317/multiexp_test.go | 76 ++++++++++++++++++- ecc/bn254/multiexp_affine.go | 2 + ecc/bn254/multiexp_test.go | 76 ++++++++++++++++++- ecc/bw6-633/multiexp_affine.go | 2 + ecc/bw6-633/multiexp_test.go | 76 ++++++++++++++++++- ecc/bw6-756/multiexp_affine.go | 2 + ecc/bw6-756/multiexp_test.go | 76 ++++++++++++++++++- ecc/bw6-761/multiexp_affine.go | 2 + ecc/bw6-761/multiexp_test.go | 76 ++++++++++++++++++- .../ecc/template/multiexp_affine.go.tmpl | 1 + .../ecc/template/tests/multiexp.go.tmpl | 34 +++++++++ 20 files changed, 701 insertions(+), 36 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 8f33a80438..65e531185b 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af 
add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 347fb0ab6b..3c8cf6ff05 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 09ee239d09..f48f316a4a 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ 
-164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 90b1321a9a..d693b2c8d7 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + 
testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index f452ac210e..2e3776394d 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 5b0b8eb7cc..c22d9bc508 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b 
*testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index fa683c78a1..d253497f17 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 9bda4cefd3..6c169ffc9c 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] 
+ } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 913b2e9308..d6a509fc82 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index c166598a34..5b36edf2ee 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + 
sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 8f6e9073b8..939a1b71f2 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -164,6 +164,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -443,6 +444,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 5962e0b859..67cfc85953 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant 
[nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 870483e934..f3c51b51bf 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -163,6 +163,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -370,6 +371,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 45bef3125a..a65f05c5c2 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ 
func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index ca6e7c172a..0925a40c35 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -163,6 +163,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -370,6 +371,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 57956e9c5a..5b81aea368 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], 
ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 3b653b6563..6b4ec532ea 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -163,6 +163,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } @@ -370,6 +371,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 968613803a..38283453c4 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -222,11 +222,31 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine @@ -240,6 +260,20 @@ func BenchmarkMultiExpG1(b *testing.B) { 
testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -503,11 +537,31 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // should split the scalars + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i := 0; i < len(sampleScalarsRedundant); i += 10 { + for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine @@ -521,6 +575,20 @@ func BenchmarkMultiExpG2(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using], ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 8462321866..513b06f96f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -173,6 +173,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() + processQueue() } } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 455142ceb6..5e0298d575 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -237,9 +237,29 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var ( samplePoints [nbSamples]{{ $.TAffine }} sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) + copy(sampleScalarsSmallValues[:],sampleScalars[:]) + copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // should split the scalars + for i:=0; i < len(sampleScalarsSmallValues);i++ { + if i % 5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine + for i:=0; i < len(sampleScalarsRedundant);i+=10 { + for j:=i+1; j < i+10 && j < 
len(sampleScalarsRedundant);j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } + fillBenchBases{{ toUpper $.PointName }}(samplePoints[:]) @@ -254,6 +274,20 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { testPoint.MultiExp(samplePoints[:using], sampleScalars[:using],ecc.MultiExpConfig{}) } }) + + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + } + }) } } From 52e5eaa8405c9f3ae5ae60c0933b1899d5c67ab8 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 12:28:04 -0600 Subject: [PATCH 26/43] test: update worst case benchmark for batch affine msm --- ecc/bls12-377/multiexp_test.go | 12 ++++++------ ecc/bls12-378/multiexp_test.go | 12 ++++++------ ecc/bls12-381/multiexp_test.go | 12 ++++++------ ecc/bls24-315/multiexp_test.go | 12 ++++++------ ecc/bls24-317/multiexp_test.go | 12 ++++++------ ecc/bn254/multiexp_test.go | 12 ++++++------ ecc/bw6-633/multiexp_test.go | 12 ++++++------ ecc/bw6-756/multiexp_test.go | 12 ++++++------ ecc/bw6-761/multiexp_test.go | 12 ++++++------ .../generator/ecc/template/tests/multiexp.go.tmpl | 6 +++--- 10 files changed, 57 insertions(+), 57 deletions(-) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 3c8cf6ff05..6425235853 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index d693b2c8d7..1b586aba7e 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -232,7 +232,7 
@@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index c22d9bc508..2b3ac662fc 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 6c169ffc9c..e0b3958b87 100644 --- a/ecc/bls24-315/multiexp_test.go 
+++ b/ecc/bls24-315/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 5b36edf2ee..7bc0eacb61 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 67cfc85953..d5840336a4 
100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index a65f05c5c2..244ae19386 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bw6-756/multiexp_test.go 
b/ecc/bw6-756/multiexp_test.go index 5b81aea368..2a1f0cda97 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 38283453c4..589464949f 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -232,7 +232,7 @@ func BenchmarkMultiExpG1(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -241,8 +241,8 @@ func BenchmarkMultiExpG1(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } @@ -547,7 +547,7 @@ func BenchmarkMultiExpG2(b *testing.B) { copy(sampleScalarsSmallValues[:], sampleScalars[:]) copy(sampleScalarsRedundant[:], sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i := 0; i < len(sampleScalarsSmallValues); i++ { if i%5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -556,8 +556,8 @@ func BenchmarkMultiExpG2(b *testing.B) { } // bad case for batch affine - for i := 0; i < len(sampleScalarsRedundant); i += 10 { - for j := i + 1; j < i+10 && j < len(sampleScalarsRedundant); j++ { + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } diff 
--git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 5e0298d575..9b259ebca4 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -245,7 +245,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { copy(sampleScalarsSmallValues[:],sampleScalars[:]) copy(sampleScalarsRedundant[:],sampleScalars[:]) - // should split the scalars + // this means first chunk is going to have more work to do and should be split into several go routines for i:=0; i < len(sampleScalarsSmallValues);i++ { if i % 5 == 0 { sampleScalarsSmallValues[i].SetZero() @@ -254,8 +254,8 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { } // bad case for batch affine - for i:=0; i < len(sampleScalarsRedundant);i+=10 { - for j:=i+1; j < i+10 && j < len(sampleScalarsRedundant);j++ { + for i:=0; i < len(sampleScalarsRedundant);i+=100 { + for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] } } From 52191c989a294085bcba460c35a2f33b82bd52da Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 14:30:35 -0600 Subject: [PATCH 27/43] feat: start to add statistics when parsing scalars in msm --- ecc/bls12-377/multiexp.go | 170 ++++++++++++++---- ecc/bls12-377/multiexp_test.go | 8 +- ecc/bls12-378/multiexp.go | 170 ++++++++++++++---- ecc/bls12-378/multiexp_test.go | 8 +- ecc/bls12-381/multiexp.go | 170 ++++++++++++++---- ecc/bls12-381/multiexp_test.go | 8 +- ecc/bls24-315/multiexp.go | 170 ++++++++++++++---- ecc/bls24-315/multiexp_test.go | 8 +- ecc/bls24-317/multiexp.go | 170 ++++++++++++++---- ecc/bls24-317/multiexp_test.go | 8 +- ecc/bn254/multiexp.go | 170 ++++++++++++++---- ecc/bn254/multiexp_test.go | 8 +- ecc/bw6-633/multiexp.go | 122 ++++++++++--- ecc/bw6-633/multiexp_test.go | 8 +- ecc/bw6-756/multiexp.go | 122 ++++++++++--- ecc/bw6-756/multiexp_test.go | 8 +- ecc/bw6-761/multiexp.go | 122 ++++++++++--- ecc/bw6-761/multiexp_test.go | 8 +- .../generator/ecc/template/multiexp.go.tmpl | 106 +++++++++-- .../ecc/template/tests/multiexp.go.tmpl | 4 +- 20 files changed, 1216 insertions(+), 352 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index b04f523cff..fdd6b005bf 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 
4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, 
bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. this is meant to give a sense of statistical + // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of buckets that are non-zero for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < 
len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk, the number of non zero buckets + // so we can compute the deviation; + // TODO @gbotrel do that in go routines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 6425235853..3f5ea45edd 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
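The "redundant" distribution is adversarial for exactly the reason the comment above states: within one window of batchSize pending additions, two points that land in the same bucket cannot share the batched inversion, so the second one is parked in a queue and the processor is forced to flush early. A minimal sketch of that scheduling decision, with hypothetical names (the generated processors implement the same idea with a bitSet type and a fixed-capacity queue of conflicted points):

// scheduleAdd either admits a point into the current batch or defers it.
// touched marks buckets that already have a pending addition in this batch;
// a second hit on the same bucket would reuse a stale denominator before
// the bucket is updated, so the point must wait for a later batch.
func scheduleAdd(touched []bool, batch, queue *[]int, bucketID int) {
	if !touched[bucketID] {
		touched[bucketID] = true
		*batch = append(*batch, bucketID)
		return
	}
	// conflict: long runs of identical scalars hit this path constantly,
	// filling the queue and forcing small, inefficient flushes.
	*queue = append(*queue, bucketID)
}

With uniformly random scalars such conflicts are rare; copying one scalar across 100 consecutive slots, as the loop just below does, makes them the common case.
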
for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index cd49deb3de..711a527770 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p 
*G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 
500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of statistical + // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of buckets that are non-zero for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk, the number of non-zero buckets + // so we can compute the deviation; + // TODO @gbotrel do that in go routines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
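The weight computation, whose loop continues just below, is now integer-only: each chunk's op count becomes a percentage of the average per-chunk workload, and the target != 0 guard prevents a division by zero in the all-zero-scalars case. A self-contained sketch of the same normalization (hypothetical helper, not part of the generated code):

// chunkWeights expresses each chunk's workload as a percentage of the
// average per-chunk workload; 100 means nominal weight. A zero target
// (all scalars zero) leaves every weight at zero, so no chunk is split.
func chunkWeights(opsPerChunk []int) []int {
	weights := make([]int, len(opsPerChunk))
	if len(opsPerChunk) == 0 {
		return weights
	}
	totalOps := 0
	for _, nbOps := range opsPerChunk {
		totalOps += nbOps
	}
	target := totalOps / len(opsPerChunk)
	if target == 0 {
		return weights // no work to be done anywhere
	}
	for i, nbOps := range opsPerChunk {
		weights[i] = nbOps * 100 / target
	}
	return weights
}

_innerMsmG1 and _innerMsmG2 then split any chunk whose weight is at least 115, i.e. roughly 15% above nominal, across two goroutines.
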
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 1b586aba7e..fd3aee65e6 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 4d7a2e07f5..7402fa0c62 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, 
bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
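Besides lowering the split threshold from 150.0 to 115, the hunk continuing below fixes a classic Go closure pitfall: the merge goroutine used to capture the loop variable j, which (under pre-1.22 Go semantics) is shared across iterations, so the combined partial sum could be delivered to the wrong chunk channel; passing it explicitly as chunkID pins the value at spawn time. A minimal, runnable reproduction of the corrected fan-in pattern, with ints standing in for the extended-Jacobian partial sums:

package main

import "fmt"

func main() {
	const nbChunks = 3
	var chChunks [nbChunks]chan int
	for j := range chChunks {
		chChunks[j] = make(chan int, 1)
	}
	for j := nbChunks - 1; j >= 0; j-- {
		chSplit := make(chan int, 2)
		go func() { chSplit <- 1 }() // first half of the chunk
		go func() { chSplit <- 2 }() // second half of the chunk
		go func(chunkID int) {
			// chunkID is a copy made at spawn time; reading j here
			// instead could observe a later iteration's value.
			s1 := <-chSplit
			s2 := <-chSplit
			close(chSplit)
			chChunks[chunkID] <- s1 + s2 // merge the two partial sums
		}(j)
	}
	for j := range chChunks {
		fmt.Println("chunk", j, "sum", <-chChunks[j])
	}
}
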
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, 
bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. this is meant to give a sense of statistical + // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of buckets that are non-zero for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < 
len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done. + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 2b3ac662fc..9ce352672c 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points.
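Note on the "bad case" above: a rough standalone sketch of the effect (countConflicts is a hypothetical helper written for this note, not code from the patch). When blocks of identical scalars map to the same bucket, almost every addition finds its bucket already queued in the current batch window and forces an early flush:

	// hypothetical helper, for illustration only: count additions whose
	// bucket is already queued in the current batch window.
	func countConflicts(bucketIDs []uint16, batchSize int) int {
		conflicts := 0
		inBatch := make(map[uint16]struct{}, batchSize)
		for _, b := range bucketIDs {
			if _, queued := inBatch[b]; queued {
				conflicts++
				// model an early flush: the pending batch is processed so
				// this point can be scheduled against a fresh window
				inBatch = map[uint16]struct{}{b: {}}
				continue
			}
			inBatch[b] = struct{}{}
			if len(inBatch) == batchSize {
				inBatch = make(map[uint16]struct{}, batchSize) // full batch: normal flush
			}
		}
		return conflicts
	}

With uniform random scalars, conflicts stay rare and batches fill up; with the redundant scalars built below, the conflict path dominates and batches stay small.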
for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 3f49fd1afd..f1b13bdfc1 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p 
*G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 
500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of statistical + dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of non-zero buckets for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done.
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index e0b3958b87..c40b9ccf21 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 29548cc89c..e1c79c6ba5 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, 
bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
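For intuition on the weight >= 115 test just above: weight is an integer percentage of the average chunk workload, so 115 reads as "at least 15% more ops than the average chunk". A sketch with invented numbers (not taken from any benchmark), assuming the definitions from partitionScalars:

	// illustrative arithmetic only
	func exampleSplitDecision() bool {
		totalOps, nbChunks := 4000, 8
		target := totalOps / nbChunks // 500 ops per chunk on average
		weight := 650 * 100 / target  // a 650-op chunk gets weight 130
		return weight >= 115          // true: this chunk is split across two goroutines
	}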
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, 
bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. this is meant to give a sense of statistical + dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of non-zero buckets for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i <
len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done. + for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 7bc0eacb61..87caea0886 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points.
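The edge-case test repeated in each switch case of getChunkProcessorG1/G2 above can be read as a single predicate. A sketch of the rule, factored out for readability (an assumed form; the generated code inlines it per window size, with batchSize growing from 80 at c=10 to 640 at c=16):

	// sketch only; the patch inlines this check in every case of the switch
	func useBatchAffine(stat chunkStat, batchSize int) bool {
		// batch-affine additions pay off only when enough distinct buckets
		// are hit to fill batches (nonZeroBuckets >= batchSize) and the ops
		// are spread evenly enough (low deviation) that the queue of
		// conflicted points stays short; otherwise fall back to extended
		// Jacobian processing.
		return stat.nonZeroBuckets >= batchSize && stat.deviation < 4
	}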
for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 87cee65e19..0fe60e8201 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -153,37 +152,51 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC10] } return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC11] } return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC12] } return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC13] } return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC14] } return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC15] } return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -212,24 +225,24 @@ func _innerMsmG1(p *G1Jac, c uint64, 
points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -365,8 +378,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -382,37 +394,51 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 9: return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - if mustBeExt { + const batchSize = 80 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC10] } return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - if mustBeExt { + const batchSize = 150 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC11] } return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - if mustBeExt { + const batchSize = 200 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC12] } return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - if mustBeExt { + const batchSize = 350 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC13] } return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - if mustBeExt { + const batchSize = 400 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC14] } return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - if mustBeExt { + const batchSize = 500 + 
edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC15] } return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -441,24 +467,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -518,7 +544,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of statistical + dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of non-zero buckets for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -555,9 +592,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -593,16 +635,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -610,24 +654,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done.
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index d5840336a4..378662496e 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index d77d85346f..c8d0d6518e 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -147,7 +146,9 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -176,24 +177,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
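The go func(chunkID int) { ... }(j) rewrite above (and in every copy of this loop) fixes a classic Go pitfall: a closure launched inside a loop shares the loop variable, so by the time the goroutine runs, j may already have advanced and s1 would be sent on the wrong channel. A minimal, self-contained illustration (pre-Go 1.22 loop-variable semantics):

	package main

	import (
		"fmt"
		"sync"
	)

	func main() {
		var wg sync.WaitGroup
		for j := 0; j < 3; j++ {
			wg.Add(2)
			go func() { // shares j: may print 3 three times
				defer wg.Done()
				fmt.Println("captured:", j)
			}()
			go func(id int) { // pins the value of j at spawn time
				defer wg.Done()
				fmt.Println("pinned:", id)
			}(j)
		}
		wg.Wait()
	}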
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -329,8 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -340,7 +340,9 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -369,24 +371,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -446,7 +448,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of statistical + dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + deviation int + + // count the number of non-zero buckets for this chunk + nonZeroBuckets int + + // average ops per non-zero bucket + averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -483,9 +496,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk := make(chan []int, nbTasks) + chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { opsPerChunk := make([]int, nbChunks) + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -521,16 +539,18 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks carry = 1 } - var bits uint16 - // if digit is zero, no impact on result if digit == 0 { continue } + + var bits uint16 if digit > 0 { bits = uint16(digit) << 1 + opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 + opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits opsPerChunk[chunk]++ @@ -538,24 +558,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } chOpsPerChunk <- opsPerChunk + chOpsPerBucketPerChunk <- opsPerBucketPerChunk }, nbTasks) // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) close(chOpsPerChunk) + close(chOpsPerBucketPerChunk) opsPerChunk := make([]int, nbChunks) totalOps := 0 - for o := range chOpsPerChunk { - for i, nbOps := range o { + for chunks := range chOpsPerChunk { + for i, nbOps := range chunks { opsPerChunk[i] += nbOps totalOps += nbOps } } - chunkStats := make([]chunkStat, nbChunks) - target := float32(totalOps) / float32(nbChunks) + + opsPerBucketPerChunk := make([][]int, nbChunks) + for i := 0; i < len(opsPerBucketPerChunk); i++ { + opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets + } + for chunks := range chOpsPerBucketPerChunk { + for i, opsPerBucket := range chunks { + for j, o := range opsPerBucket { + // bucket j in chunk i has o operations + if opsPerBucketPerChunk[i][j] == 0 && o != 0 { + chunkStats[i].nonZeroBuckets++ + } + opsPerBucketPerChunk[i][j] += o + } + } + } + + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } + + // we know the total ops for the chunk and the number of non-zero buckets, + // so we can compute the deviation; + // TODO @gbotrel do that in goroutines + for chunkID := 0; chunkID < len(chunkStats); chunkID++ { + nz := chunkStats[chunkID].nonZeroBuckets + if nz == 0 { + continue // ignore chunk, full of zeroes. + } + mean := opsPerChunk[chunkID] / nz + aad := 0 + averageOpsPerBucket := 0 + for _, bucketOps := range opsPerBucketPerChunk[chunkID] { + aad += abs(bucketOps - mean) + averageOpsPerBucket += bucketOps + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz + } + + target := totalOps / int(nbChunks) // what percentage are you of the target - for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target + if target != 0 { + // if target == 0, it means all the scalars are zero and there is no work to be done.
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 244ae19386..622c98195b 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index ae313e1d52..72a0b35b7b 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -147,7 +146,9 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -176,24 +177,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
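For reference, the signed-digit packing that partitionScalars uses above round-trips as follows (sketch; the function names are ours, the patch only ever stores the uint16):

	// bit 0 of the encoded digit tells the chunk processor whether to add or
	// subtract the point; the remaining bits select the bucket. A zero digit
	// is skipped by the caller and never encoded.
	func encodeDigit(d int) uint16 {
		if d == 0 {
			return 0
		}
		if d > 0 {
			return uint16(d) << 1 // LSB 0: add the point
		}
		return uint16(-d-1)<<1 + 1 // LSB 1: subtract the point
	}

	func decodeDigit(bits uint16) int {
		if bits&1 == 0 {
			return int(bits >> 1)
		}
		return -int(bits>>1) - 1
	}

	// e.g. encodeDigit(5) == 10 and decodeDigit(10) == 5;
	//      encodeDigit(-3) == 5 and decodeDigit(5) == -3

Folding negative digits onto the same magnitudes is also why only 1 << (c - 1) buckets are allocated per chunk.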
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -329,8 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -340,7 +340,9 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -369,24 +371,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -446,7 +448,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of the statistical
+	// dispersion of the scalars[chunk] in the buckets that are hit (nonZeroBuckets)
+	deviation int
+
+	// number of non-zero buckets for this chunk
+	nonZeroBuckets int
+
+	// average ops per non-zero bucket
+	averageOpsPerBucket int
 }
 
 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -483,9 +496,14 @@
 	}
 
 	chOpsPerChunk := make(chan []int, nbTasks)
+	chOpsPerBucketPerChunk := make(chan [][]int, nbTasks)
 
 	parallel.Execute(len(scalars), func(start, end int) {
 		opsPerChunk := make([]int, nbChunks)
+		opsPerBucketPerChunk := make([][]int, nbChunks)
+		for i := 0; i < len(opsPerBucketPerChunk); i++ {
+			opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets
+		}
 		for i := start; i < end; i++ {
 			scalar := scalars[i]
 			if scalarsMont {
@@ -521,16 +539,18 @@
 				carry = 1
 			}
 
-			var bits uint16
-
 			// if digit is zero, no impact on result
 			if digit == 0 {
 				continue
 			}
+
+			var bits uint16
 			if digit > 0 {
 				bits = uint16(digit) << 1
+				opsPerBucketPerChunk[chunk][uint16(digit)]++
 			} else {
 				bits = (uint16(-digit-1) << 1) + 1
+				opsPerBucketPerChunk[chunk][uint16(-digit-1)]++
 			}
 			digits[int(chunk)*len(scalars)+i] = bits
 			opsPerChunk[chunk]++
@@ -538,24 +558,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 		}
 
 		chOpsPerChunk <- opsPerChunk
+		chOpsPerBucketPerChunk <- opsPerBucketPerChunk
 	}, nbTasks)
 
 	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
 	close(chOpsPerChunk)
+	close(chOpsPerBucketPerChunk)
 	opsPerChunk := make([]int, nbChunks)
 	totalOps := 0
-	for o := range chOpsPerChunk {
-		for i, nbOps := range o {
+	for chunks := range chOpsPerChunk {
+		for i, nbOps := range chunks {
 			opsPerChunk[i] += nbOps
 			totalOps += nbOps
 		}
 	}
-	chunkStats := make([]chunkStat, nbChunks)
-	target := float32(totalOps) / float32(nbChunks)
+
+	opsPerBucketPerChunk := make([][]int, nbChunks)
+	for i := 0; i < len(opsPerBucketPerChunk); i++ {
+		opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets
+	}
+	for chunks := range chOpsPerBucketPerChunk {
+		for i, opsPerBucket := range chunks {
+			for j, o := range opsPerBucket {
+				// bucket j in chunk i has o operations
+				if opsPerBucketPerChunk[i][j] == 0 && o != 0 {
+					chunkStats[i].nonZeroBuckets++
+				}
+				opsPerBucketPerChunk[i][j] += o
+			}
+		}
+	}
+
+	abs := func(v int) int {
+		if v < 0 {
+			return -v
+		}
+		return v
+	}
+
+	// we know the total ops for the chunk and the number of non-zero buckets,
+	// so we can compute the deviation;
+	// TODO @gbotrel do that in goroutines
+	for chunkID := 0; chunkID < len(chunkStats); chunkID++ {
+		nz := chunkStats[chunkID].nonZeroBuckets
+		if nz == 0 {
+			continue // ignore chunk, full of zeroes.
+		}
+		mean := opsPerChunk[chunkID] / nz
+		aad := 0
+		averageOpsPerBucket := 0
+		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
+			aad += abs(bucketOps - mean)
+			averageOpsPerBucket += bucketOps
+		}
+		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+		chunkStats[chunkID].deviation = aad / nz
+	}
+
+	target := totalOps / int(nbChunks)
 	// what percentage are you of the target
-	for i := 0; i < len(chunkStats); i++ {
-		chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target
+	if target != 0 {
+		// if target == 0, all the scalars are 0 everywhere; there is no work to be done.
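+		// e.g. with totalOps = 300 over 3 chunks, target = 100 and chunks doing
+		// 50, 100 and 150 ops get weights 50, 100 and 150; _innerMsmG1/G2 then
+		// split any chunk whose weight reaches 115 in two.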
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 2a1f0cda97..f7488f5dd7 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 1c7ae6f672..32920cef38 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -136,8 +136,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { case 4: @@ -147,7 +146,9 @@ func getChunkProcessorG1(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG1Jacobian[bucketg1JacExtendedC16] } return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] @@ -176,24 +177,24 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG1(c) + processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c)) + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. 
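			// (weight is the chunk's share of work as a percentage of the average
			// chunk load: e.g. with three chunks doing 50, 100 and 150 ops, the
			// weights are 50, 100 and 150, and only the last one is split here.)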
chSplit := make(chan g1JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -329,8 +330,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul return p, nil } -func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - mustBeExt := false +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { case 4: @@ -340,7 +340,9 @@ func getChunkProcessorG2(c uint64 /* some other params to determine extJac*/) fu case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - if mustBeExt { + const batchSize = 640 + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunkG2Jacobian[bucketg2JacExtendedC16] } return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] @@ -369,24 +371,24 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessorG2(c) + processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c)) + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan g2JacExtended, 2) split := n / 2 go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) @@ -446,7 +448,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of the statistical
+	// dispersion of the scalars[chunk] in the buckets that are hit (nonZeroBuckets)
+	deviation int
+
+	// number of non-zero buckets for this chunk
+	nonZeroBuckets int
+
+	// average ops per non-zero bucket
+	averageOpsPerBucket int
 }
 
 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -483,9 +496,14 @@
 	}
 
 	chOpsPerChunk := make(chan []int, nbTasks)
+	chOpsPerBucketPerChunk := make(chan [][]int, nbTasks)
 
 	parallel.Execute(len(scalars), func(start, end int) {
 		opsPerChunk := make([]int, nbChunks)
+		opsPerBucketPerChunk := make([][]int, nbChunks)
+		for i := 0; i < len(opsPerBucketPerChunk); i++ {
+			opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets
+		}
 		for i := start; i < end; i++ {
 			scalar := scalars[i]
 			if scalarsMont {
@@ -521,16 +539,18 @@
 				carry = 1
 			}
 
-			var bits uint16
-
 			// if digit is zero, no impact on result
 			if digit == 0 {
 				continue
 			}
+
+			var bits uint16
 			if digit > 0 {
 				bits = uint16(digit) << 1
+				opsPerBucketPerChunk[chunk][uint16(digit)]++
 			} else {
 				bits = (uint16(-digit-1) << 1) + 1
+				opsPerBucketPerChunk[chunk][uint16(-digit-1)]++
 			}
 			digits[int(chunk)*len(scalars)+i] = bits
 			opsPerChunk[chunk]++
@@ -538,24 +558,72 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 		}
 
 		chOpsPerChunk <- opsPerChunk
+		chOpsPerBucketPerChunk <- opsPerBucketPerChunk
 	}, nbTasks)
 
 	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
 	close(chOpsPerChunk)
+	close(chOpsPerBucketPerChunk)
 	opsPerChunk := make([]int, nbChunks)
 	totalOps := 0
-	for o := range chOpsPerChunk {
-		for i, nbOps := range o {
+	for chunks := range chOpsPerChunk {
+		for i, nbOps := range chunks {
 			opsPerChunk[i] += nbOps
 			totalOps += nbOps
 		}
 	}
-	chunkStats := make([]chunkStat, nbChunks)
-	target := float32(totalOps) / float32(nbChunks)
+
+	opsPerBucketPerChunk := make([][]int, nbChunks)
+	for i := 0; i < len(opsPerBucketPerChunk); i++ {
+		opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets
+	}
+	for chunks := range chOpsPerBucketPerChunk {
+		for i, opsPerBucket := range chunks {
+			for j, o := range opsPerBucket {
+				// bucket j in chunk i has o operations
+				if opsPerBucketPerChunk[i][j] == 0 && o != 0 {
+					chunkStats[i].nonZeroBuckets++
+				}
+				opsPerBucketPerChunk[i][j] += o
+			}
+		}
+	}
+
+	abs := func(v int) int {
+		if v < 0 {
+			return -v
+		}
+		return v
+	}
+
+	// we know the total ops for the chunk and the number of non-zero buckets,
+	// so we can compute the deviation;
+	// TODO @gbotrel do that in goroutines
+	for chunkID := 0; chunkID < len(chunkStats); chunkID++ {
+		nz := chunkStats[chunkID].nonZeroBuckets
+		if nz == 0 {
+			continue // ignore chunk, full of zeroes.
+		}
+		mean := opsPerChunk[chunkID] / nz
+		aad := 0
+		averageOpsPerBucket := 0
+		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
+			aad += abs(bucketOps - mean)
+			averageOpsPerBucket += bucketOps
+		}
+		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+		chunkStats[chunkID].deviation = aad / nz
+	}
+
+	target := totalOps / int(nbChunks)
 	// what percentage are you of the target
-	for i := 0; i < len(chunkStats); i++ {
-		chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target
+	if target != 0 {
+		// if target == 0, all the scalars are 0 everywhere; there is no work to be done.
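+		// e.g. with totalOps = 300 over 3 chunks, target = 100 and chunks doing
+		// 50, 100 and 150 ops get weights 50, 100 and 150; _innerMsmG1/G2 then
+		// split any chunk whose weight reaches 115 in two.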
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } return digits, chunkStats diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 589464949f..c7fdcf64cf 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -240,7 +240,9 @@ func BenchmarkMultiExpG1(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] @@ -555,7 +557,9 @@ func BenchmarkMultiExpG2(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. for i := 0; i < len(sampleScalarsRedundant); i += 100 { for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 4aa916fcfc..a274e26e66 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -55,7 +55,18 @@ func lastC(c uint64) uint64 { } type chunkStat struct { - weight float32 // relative weight compared to other chunks. + // relative weight of work compared to other chunks. 100.0 -> nominal weight. + weight int + + // average absolute deviation. 
this is meant to give a sense of the statistical
+	// dispersion of the scalars[chunk] in the buckets that are hit (nonZeroBuckets)
+	deviation int
+
+	// number of non-zero buckets for this chunk
+	nonZeroBuckets int
+
+	// average ops per non-zero bucket
+	averageOpsPerBucket int
 }
 
 
@@ -94,9 +105,14 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	}
 
 	chOpsPerChunk := make(chan []int, nbTasks)
+	chOpsPerBucketPerChunk := make(chan [][]int, nbTasks)
 
 	parallel.Execute(len(scalars), func(start, end int) {
 		opsPerChunk := make([]int, nbChunks)
+		opsPerBucketPerChunk := make([][]int, nbChunks)
+		for i:=0; i < len(opsPerBucketPerChunk);i++ {
+			opsPerBucketPerChunk[i] = make([]int, (1 << (c-1))) // nbBuckets
+		}
 		for i:=start; i < end; i++ {
 			scalar := scalars[i]
 			if scalarsMont {
@@ -133,16 +149,18 @@
 				carry = 1
 			}
 
-			var bits uint16
-
 			// if digit is zero, no impact on result
 			if digit == 0 {
 				continue
 			}
+
+			var bits uint16
 			if digit > 0 {
 				bits = uint16(digit) << 1
+				opsPerBucketPerChunk[chunk][uint16(digit)]++
 			} else {
 				bits = (uint16(-digit-1) << 1) + 1
+				opsPerBucketPerChunk[chunk][uint16(-digit-1)]++
 			}
 			digits[int(chunk)*len(scalars)+i] = bits
 			opsPerChunk[chunk]++
@@ -150,26 +168,77 @@
 		}
 
 		chOpsPerChunk <- opsPerChunk
+		chOpsPerBucketPerChunk <- opsPerBucketPerChunk
 	}, nbTasks)
 
 	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
 	close(chOpsPerChunk)
+	close(chOpsPerBucketPerChunk)
 	opsPerChunk := make([]int, nbChunks)
 	totalOps := 0
-	for o := range chOpsPerChunk {
-		for i, nbOps := range o {
+	for chunks := range chOpsPerChunk {
+		for i, nbOps := range chunks {
 			opsPerChunk[i]+=nbOps
 			totalOps += nbOps
 		}
 	}
-	chunkStats := make([]chunkStat, nbChunks)
-	target := float32(totalOps) / float32(nbChunks)
+
+
+	opsPerBucketPerChunk := make([][]int, nbChunks)
+	for i:=0; i < len(opsPerBucketPerChunk);i++ {
+		opsPerBucketPerChunk[i] = make([]int, (1 << (c-1))) // nbBuckets
+	}
+	for chunks := range chOpsPerBucketPerChunk {
+		for i, opsPerBucket := range chunks {
+			for j, o := range opsPerBucket {
+				// bucket j in chunk i has o operations
+				if opsPerBucketPerChunk[i][j] == 0 && o != 0 {
+					chunkStats[i].nonZeroBuckets++
+				}
+				opsPerBucketPerChunk[i][j] += o
+			}
+		}
+	}
+
+	abs := func(v int) int {
+		if v < 0 {
+			return -v
+		}
+		return v
+	}
+
+	// we know the total ops for the chunk and the number of non-zero buckets,
+	// so we can compute the deviation;
+	// TODO @gbotrel do that in goroutines
+	for chunkID:=0; chunkID < len(chunkStats); chunkID++ {
+		nz := chunkStats[chunkID].nonZeroBuckets
+		if nz == 0 {
+			continue // ignore chunk, full of zeroes.
+		}
+		mean := opsPerChunk[chunkID] / nz
+		aad := 0
+		averageOpsPerBucket := 0
+		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
+			aad += abs(bucketOps - mean)
+			averageOpsPerBucket += bucketOps
+		}
+		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+		chunkStats[chunkID].deviation = aad / nz
+	}
+
+
+	target := totalOps / int(nbChunks)
 	// what percentage are you of the target
-	for i := 0; i < len(chunkStats); i++ {
-		chunkStats[i].weight = float32(opsPerChunk[i]) * 100.0 / target
+	if target != 0 {
+		// if target == 0, all the scalars are 0 everywhere; there is no work to be done.
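+		// e.g. with totalOps = 300 over 3 chunks, target = 100 and chunks doing
+		// 50, 100 and 150 ops get weights 50, 100 and 150; the inner msm then
+		// splits any chunk whose weight reaches 115 in two.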
+ for i := 0; i < len(chunkStats); i++ { + chunkStats[i].weight = opsPerChunk[i] * 100 / target + } } + return digits, chunkStats } @@ -420,15 +489,16 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } -func getChunkProcessor{{ $.UPointName }}(c uint64 /* some other params to determine extJac*/) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) { - mustBeExt := false +func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) { switch c { {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - if mustBeExt { + const batchSize = {{batchSize $c}} + edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) + if edgeCaseAffine { return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] } return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] @@ -459,24 +529,24 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - processChunk := getChunkProcessor{{ $.UPointName }}(c) + processChunk := getChunkProcessor{{ $.UPointName }}(c, chunkStats[j]) if j == int(nbChunks - 1) { - processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c)) + processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c), chunkStats[j]) } - if chunkStats[j].weight >= 150.0 { + if chunkStats[j].weight >= 115 { // we split this in more go routines since this chunk has more work to do than the others. // else what would happen is this go routine would finish much later than the others. chSplit := make(chan {{ $.TJacobianExtended }}, 2) split := n / 2 go processChunk(uint64(j),chSplit, c, points[:split], digits[j*n:(j*n)+split]) go processChunk(uint64(j),chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func() { + go func(chunkID int) { s1 := <-chSplit s2 := <-chSplit close(chSplit) s1.add(&s2) - chChunks[j] <- s1 - }() + chChunks[chunkID] <- s1 + }(j) continue } go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 9b259ebca4..f208891b47 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -253,7 +253,9 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { } } - // bad case for batch affine + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
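+	// (each run of 100 consecutive points below shares a single scalar, so within
+	// a chunk those contributions all target the same bucket and keep colliding.)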
for i:=0; i < len(sampleScalarsRedundant);i+=100 { for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { sampleScalarsRedundant[j] = sampleScalarsRedundant[i] From 1a91d5a74bb1e41cd93b9600e29cdc474876ddac Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 15 Nov 2022 17:38:49 -0600 Subject: [PATCH 28/43] checkpoint --- ecc/bls12-377/multiexp.go | 364 +++++++++++------- ecc/bls12-377/multiexp_affine.go | 104 ++++- ecc/bls12-377/multiexp_test.go | 4 +- ecc/bls12-378/multiexp.go | 364 +++++++++++------- ecc/bls12-378/multiexp_affine.go | 104 ++++- ecc/bls12-378/multiexp_test.go | 4 +- ecc/bls12-381/multiexp.go | 364 +++++++++++------- ecc/bls12-381/multiexp_affine.go | 104 ++++- ecc/bls12-381/multiexp_test.go | 4 +- ecc/bls24-315/multiexp.go | 364 +++++++++++------- ecc/bls24-315/multiexp_affine.go | 104 ++++- ecc/bls24-315/multiexp_test.go | 4 +- ecc/bls24-317/multiexp.go | 364 +++++++++++------- ecc/bls24-317/multiexp_affine.go | 104 ++++- ecc/bls24-317/multiexp_test.go | 4 +- ecc/bn254/multiexp.go | 364 +++++++++++------- ecc/bn254/multiexp_affine.go | 104 ++++- ecc/bn254/multiexp_test.go | 4 +- ecc/bw6-633/multiexp.go | 166 ++++---- ecc/bw6-633/multiexp_affine.go | 104 ++++- ecc/bw6-633/multiexp_test.go | 4 +- ecc/bw6-756/multiexp.go | 166 ++++---- ecc/bw6-756/multiexp_affine.go | 104 ++++- ecc/bw6-756/multiexp_test.go | 4 +- ecc/bw6-761/multiexp.go | 166 ++++---- ecc/bw6-761/multiexp_affine.go | 104 ++++- ecc/bw6-761/multiexp_test.go | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 142 +++---- .../ecc/template/multiexp_affine.go.tmpl | 50 ++- .../ecc/template/tests/multiexp.go.tmpl | 2 +- 30 files changed, 2482 insertions(+), 1366 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index fdd6b005bf..b0c3c27e36 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -18,6 +18,7 @@ package bls12377 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
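	// (the top window only holds the scalars' leftover high bits, so lastC(c) is
	// typically smaller than c and selects a cheaper processor for that chunk.)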
n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC15] - } - return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -558,6 +631,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. @@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non 
zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. - } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps + // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if opsPerBucket[bucketID] == 0 { + nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } + + bound := 1 << (c - 1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c) - 1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b := 0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue + } + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + }, nbTasks) + + totalOps := 0 + for _, stat := range chunkStats { + totalOps += stat.weight } target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
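		// (chunkStats[i].weight still holds the raw op count for the chunk at
		// this point; the division below rescales it to a percentage of target.)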
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 65e531185b..e953a6f079 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
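	// (the conflicted points queued above were already folded into the
	// extended-Jacobian buckets by flushQueue; executeAndReset settles the
	// last, possibly partial, batch of affine additions.)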
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
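	// (the conflicted points queued above were already folded into the
	// extended-Jacobian buckets by flushQueue; executeAndReset settles the
	// last, possibly partial, batch of affine additions.)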
executeAndReset() diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 3f5ea45edd..38d85e79ea 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 711a527770..5833c4e009 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -18,6 +18,7 @@ package bls12378 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC14]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
 	case 15:
-		const batchSize = 500
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC15]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
+		// const batchSize = 500
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC15]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 	case 16:
-		const batchSize = 640
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC16]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		// const batchSize = 640
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
 		return processChunkG2Jacobian[bucketg2JacExtendedC16]
@@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-
+	// fmt.Printf("\n")
+	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
+		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -558,6 +631,10 @@ type chunkStat struct {
 	averageOpsPerBucket int
 }
 
+func (c *chunkStat) String() string {
+	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+}
+
 // partitionScalars computes, for each scalar, one digit per c-bit window (nbChunks digits in total);
 // if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
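Note: the recoding described in the comment above can be illustrated with a short, self-contained sketch. The helper name toyPartition and the fixed window width c = 4 are illustrative only and are not part of this patch:

    package main

    import "fmt"

    // toyPartition splits a small scalar into 4-bit signed digits,
    // borrowing 2^c from the next window whenever a digit exceeds
    // 2^(c-1). Every resulting digit has absolute value at most 2^(c-1).
    func toyPartition(scalar uint64) []int {
        const c = 4
        var digits []int
        carry := 0
        for scalar != 0 || carry != 0 {
            d := int(scalar&((1<<c)-1)) + carry
            scalar >>= c
            carry = 0
            if d > 1<<(c-1) {
                d -= 1 << c // borrow from the next window
                carry = 1
            }
            digits = append(digits, d)
        }
        return digits
    }

    func main() {
        // 255 = -1 + 0*16 + 1*256
        fmt.Println(toyPartition(255)) // [-1 0 1]
    }

The signed digits are what let the bucket count be halved: subtracting an affine point only requires negating its Y coordinate, so a negative digit is as cheap as a positive one.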
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so there are at most 1 << 15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID--
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk contains only zero digits; move on to the next chunk
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index f48f316a4a..533cb7304c 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index fd3aee65e6..846eab44a1 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 7402fa0c62..50b6f180d3 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -18,6 +18,7 @@ package bls12381 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC14]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
 	case 15:
-		const batchSize = 500
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC15]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
+		// const batchSize = 500
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC15]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 	case 16:
-		const batchSize = 640
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC16]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		// const batchSize = 640
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
 		return processChunkG2Jacobian[bucketg2JacExtendedC16]
@@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-
+	// fmt.Printf("\n")
+	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
+		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -558,6 +631,10 @@ type chunkStat struct {
 	averageOpsPerBucket int
 }
 
+func (c *chunkStat) String() string {
+	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+}
+
 // partitionScalars computes, for each scalar, one digit per c-bit window (nbChunks digits in total);
 // if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
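Note: since the same chunkStat fields recur for every curve in this patch, here is a small worked example of the statistics derived per chunk; the ops-per-bucket histogram is invented for illustration. deviation is the average absolute distance to the mean over non-zero buckets, the signal the (currently disabled) edgeCaseAffine heuristic consumes:

    package main

    import "fmt"

    func abs(v int) int {
        if v < 0 {
            return -v
        }
        return v
    }

    func main() {
        // hypothetical ops-per-bucket histogram for one chunk
        opsPerBucket := []int{0, 4, 0, 2, 6, 0, 0, 4}

        totalOps, nz := 0, 0 // total ops and non-zero bucket count
        for _, o := range opsPerBucket {
            if o != 0 {
                totalOps += o
                nz++
            }
        }
        mean := totalOps / nz // 16 / 4 = 4
        aad := 0              // accumulate |ops - mean| over non-zero buckets
        for _, o := range opsPerBucket {
            if o != 0 {
                aad += abs(o - mean)
            }
        }
        fmt.Println(totalOps, nz, mean, aad/nz) // 16 4 4 1
    }

A perfectly uniform chunk has deviation 0; a chunk whose ops concentrate in a few buckets has a large deviation, which is when falling back to the extended-Jacobian processor is expected to pay off.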
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so there are at most 1 << 15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID--
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk contains only zero digits; move on to the next chunk
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 2e3776394d..61ebeeebb0 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 9ce352672c..65e74e0491 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index f1b13bdfc1..3c05caa715 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -18,6 +18,7 @@ package bls24315 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC14]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14]
 	case 15:
-		const batchSize = 500
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC15]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
+		// const batchSize = 500
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC15]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15]
 	case 16:
-		const batchSize = 640
-		edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3)
-		if edgeCaseAffine {
-			return processChunkG2Jacobian[bucketg2JacExtendedC16]
-		}
-		return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
+		// const batchSize = 640
+		// status: this currently loses performance in the nominal case;
+		// stat.deviation does not look like a reliable signal.
+		// edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20)
+		// if edgeCaseAffine {
+		// 	// fmt.Printf("jacobian \n")
+		// 	return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		// }
+		// fmt.Printf("affine \n")
+		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
 		return processChunkG2Jacobian[bucketg2JacExtendedC16]
@@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-
+	// fmt.Printf("\n")
+	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
+		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -558,6 +631,10 @@ type chunkStat struct {
 	averageOpsPerBucket int
 }
 
+func (c *chunkStat) String() string {
+	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+}
+
 // partitionScalars computes, for each scalar, one digit per c-bit window (nbChunks digits in total);
 // if a digit is larger than 2^{c-1}, we borrow 2^c from the next window and subtract
 // 2^c from the current digit, making it negative.
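Note: the statistics loop in the hunk below decodes the packed uint16 digits produced by partitionScalars: a zero digit means "skip", the low bit distinguishes a subtraction from an addition, and the remaining bits carry the bucket index (off by one for positive digits, matching the encoding bits = digit<<1 for positive digits and bits = (-digit-1)<<1 + 1 for negative ones). A minimal sketch of this decoding, with the function name decode chosen for illustration:

    package main

    import "fmt"

    // decode recovers (bucketID, isNeg) from a packed digit; a value of
    // 0 is reserved for "no operation" and must be filtered out first.
    func decode(bits uint16) (bucketID uint16, isNeg bool) {
        bucketID = bits >> 1
        if bits&1 == 0 {
            bucketID-- // positive digit d is stored as d<<1, so it maps to bucket d-1
        }
        return bucketID, bits&1 == 1
    }

    func main() {
        fmt.Println(decode(6)) // digit +3: bucket 2, add
        fmt.Println(decode(5)) // digit -3: bucket 2, subtract
    }

Both signs of a digit d land in bucket |d|-1, which is why 2^{c-1} buckets suffice for a c-bit window.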
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so at most 1<<15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID -= 1
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a percentage weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk is all zeroes; move on to the next chunk in this range
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index d253497f17..85af6357fd 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index c40b9ccf21..1eb2ff3e0f 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index e1c79c6ba5..2f80a71f5c 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -18,6 +18,7 @@ package bls24317 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC15] - } - return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -558,6 +631,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
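
The chunkStat fields printed by String() above are filled in by scanning each chunk's packed digits, as in the partitionScalars changes of this patch: an even encoding 2d decodes to bucket d-1, an odd encoding decodes to bits>>1, and the deviation is the average absolute deviation of ops across non-zero buckets. A simplified, self-contained sketch of that computation (signature and slice-based buckets are our own; the patch uses a fixed [1 << 15]int array inside parallel.Execute):

// Illustrative only: per-chunk statistics over packed digits, with our
// own simplified signature.
package main

import "fmt"

func statsForChunk(chunkDigits []uint16, nbBuckets int) (weight, nonZero, deviation int) {
	opsPerBucket := make([]int, nbBuckets)
	for _, digit := range chunkDigits {
		if digit == 0 {
			continue // zero digit: no bucket op scheduled
		}
		weight++
		bucketID := int(digit >> 1)
		if digit&1 == 0 {
			bucketID-- // even encoding is a positive digit d, stored in bucket d-1
		}
		if opsPerBucket[bucketID] == 0 {
			nonZero++
		}
		opsPerBucket[bucketID]++
	}
	if nonZero == 0 {
		return // all-zero chunk: nothing to compute
	}
	mean := weight / nonZero
	aad := 0 // average absolute deviation, in integer arithmetic
	for _, ops := range opsPerBucket {
		if ops == 0 {
			continue
		}
		if ops > mean {
			aad += ops - mean
		} else {
			aad += mean - ops
		}
	}
	deviation = aad / nonZero
	return
}

func main() {
	// digits packed as in the patch: 2 and 14 are positive, 9 is negative
	fmt.Println(statsForChunk([]uint16{0, 2, 2, 9, 14, 0, 2}, 8))
	// prints: 5 3 0  (weight, non-zero buckets, deviation)
}

The integer division makes the reported deviation coarse on small chunks, which echoes the "stat.deviation seems not good" notes above.
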
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so at most 1<<15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID -= 1
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a percentage weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk is all zeroes; move on to the next chunk in this range
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index d6a509fc82..a6fcb5a2f8 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 87caea0886..14af45e5b5 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 0fe60e8201..0767ef87aa 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -18,6 +18,7 @@ package bn254 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,66 +141,100 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC13] - } - return processChunkG1BatchAffine[bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC14] - } - return processChunkG1BatchAffine[bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC15] - } - return processChunkG1BatchAffine[bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -223,8 +258,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -382,66 +419,100 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - const batchSize = 80 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] + // const batchSize = 80 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC10] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - const batchSize = 150 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + // const batchSize = 150 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC11] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - const batchSize = 200 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + // const batchSize = 200 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC12] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - const batchSize = 350 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC13] - } - return processChunkG2BatchAffine[bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] + // const batchSize = 350 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC13] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - const batchSize = 400 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC14] - } - return processChunkG2BatchAffine[bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] + // const batchSize = 400 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC14] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - const batchSize = 500 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC15] - } - return processChunkG2BatchAffine[bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] + // const batchSize = 500 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC15] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -465,8 +536,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -558,6 +631,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
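
The multiexp_affine.go hunks in this patch all rework the same control flow, so a toy model helps. The invariant is that a bucket may appear at most once per batch (otherwise two pending adds would alias the same R[i] before the shared inversion resolves them); conflicting ops are parked in a queue, and with this patch leftovers are flushed once into fallback extended-Jacobian buckets (flushQueue) rather than looping processQueue/executeAndReset. A deliberately simplified sketch with our own types (the real code also retries the queue after each mid-stream batch):

// Illustrative only: a toy model of the batch/queue discipline.
// At most one op per bucket per batch; conflicts wait in the queue.
package main

import "fmt"

type op struct{ bucketID int }

func main() {
	const batchSize = 2
	ops := []op{{1}, {1}, {2}, {1}, {3}}

	inFlight := map[int]bool{} // plays the role of the bucketIds bitset
	var batch, queue []op

	// stand-in for executeAndReset: the real code resolves the whole
	// batch with a single field inversion, then clears the bitset.
	execute := func() {
		fmt.Println("batch add:", batch)
		batch = batch[:0]
		inFlight = map[int]bool{}
	}

	for _, o := range ops {
		if inFlight[o.bucketID] {
			queue = append(queue, o) // bucket already in this batch: park the op
			continue
		}
		inFlight[o.bucketID] = true
		batch = append(batch, o)
		if len(batch) == batchSize {
			execute()
		}
	}
	if len(batch) > 0 {
		execute()
	}
	// flushQueue: with this patch, leftovers go straight to a fallback
	// accumulator (extended-Jacobian buckets) instead of re-batching.
	fmt.Println("fallback flush:", queue)
}

Running it executes two full batches ({1},{2} and {1},{3}) and leaves one conflicted op on bucket 1 for the fallback flush.
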
@@ -591,15 +668,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -643,50 +712,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -694,31 +728,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
-		}
-		mean := opsPerChunk[chunkID] / nz
-		aad := 0
-		averageOpsPerBucket := 0
-		for _, bucketOps := range opsPerBucketPerChunk[chunkID] {
-			aad += abs(bucketOps - mean)
-			averageOpsPerBucket += bucketOps
+	// aggregate chunk stats
+	chunkStats := make([]chunkStat, nbChunks)
+	parallel.Execute(len(chunkStats), func(start, end int) {
+		// for each chunk compute the statistics
+		for chunkID := start; chunkID < end; chunkID++ {
+			var opsPerBucket [1 << 15]int // c is at most 16, so at most 1<<15 buckets
+			// digits for the chunk
+			chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)]
+
+			totalOps := 0
+			nz := 0 // number of non-zero buckets
+			for _, digit := range chunkDigits {
+				if digit == 0 {
+					continue
+				}
+				totalOps++
+				bucketID := digit >> 1
+				if digit&1 == 0 {
+					bucketID -= 1
+				}
+				if opsPerBucket[bucketID] == 0 {
+					nz++
+				}
+				opsPerBucket[bucketID]++
+			}
+			chunkStats[chunkID].weight = totalOps // raw op count for now; normalized into a percentage weight below
+			chunkStats[chunkID].nonZeroBuckets = nz
+
+			if nz == 0 {
+				continue // chunk is all zeroes; move on to the next chunk in this range
+			}
+
+			bound := 1 << (c - 1)
+			if chunkID == int(nbChunks-1) {
+				bound = 1 << (lastC(c) - 1)
+			}
+			mean := totalOps / nz
+			aad := 0
+			averageOpsPerBucket := 0
+			for b := 0; b < bound; b++ {
+				if opsPerBucket[b] == 0 {
+					continue
+				}
+				aad += abs(opsPerBucket[b] - mean)
+				averageOpsPerBucket += opsPerBucket[b]
+			}
+			chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
+			chunkStats[chunkID].deviation = aad / nz
 		}
-		chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz
-		chunkStats[chunkID].deviation = aad / nz
+	}, nbTasks)
+
+	totalOps := 0
+	for _, stat := range chunkStats {
+		totalOps += stat.weight
 	}
 
 	target := totalOps / int(nbChunks)
-	// what percentage are you of the target
 	if target != 0 {
 		// if target == 0, it means all the scalars are 0 everywhere, there is no work to be done.
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 939a1b71f2..044f6834fb 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -36,7 +36,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -45,8 +45,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -71,6 +73,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -115,12 +142,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -154,8 +188,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -164,15 +197,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -316,7 +350,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -325,8 +359,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -351,6 +387,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -395,12 +456,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -434,8 +502,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -444,15 +511,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
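The structural change in both hunks: leftover conflicting ops are no longer drained through repeated executeAndReset/processQueue rounds (the commented-out loop); flushQueue folds them with addMixed into a second, extended-Jacobian bucket array, bucketsJE. addMixed needs no inversion and already copes with the doubling and infinity corner cases that the affine path still carries a TODO for. The cost is that the chunk reduction must now merge both bucket arrays; the reduction itself is outside these hunks, but it presumably looks like the sketch below (the ZZ == 0 emptiness test is an assumption about how the extended-Jacobian representation encodes infinity):

	// Hypothetical merged reduction: fold each affine bucket and its
	// conflict-queue spillover into the usual weighted running sum.
	var runningSum, total g1JacExtended
	runningSum.setInfinity()
	total.setInfinity()
	for k := len(buckets) - 1; k >= 0; k-- {
		if !buckets[k].IsInfinity() {
			runningSum.addMixed(&buckets[k]) // affine bucket
		}
		if !bucketsJE[k].ZZ.IsZero() { // assumed: ZZ == 0 encodes infinity
			runningSum.add(&bucketsJE[k]) // spillover from flushQueue
		}
		total.add(&runningSum)
	}
	chRes <- total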
executeAndReset() diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 378662496e..b3a812b962 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index c8d0d6518e..6bc12d487a 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -18,6 +18,7 @@ package bw6633 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,18 +141,25 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -175,8 +183,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
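Both the dispatch and the statistics special-case the top window, whose width lastC(c) is whatever remains of the scalar once the full c-bit windows are cut. Indicatively (the real formula lives in the generated code and accounts for the extra bit the signed recoding can carry out, so treat the constants below as illustrative, not authoritative):

	// Illustrative width of the top window for a b-bit scalar and c-bit
	// windows, with one extra bit reserved for the recoding carry.
	func lastCWidth(b, c int) int {
		nbChunks := (b + 1 + c - 1) / c // ceil((b+1)/c)
		return (b + 1) - (nbChunks-1)*c
	}

For bn254 with b = 254 and c = 16 this gives 16 chunks and a 15-bit top window, which is why the stats loop bounds its bucket scan at 1 << (lastC(c)-1) instead of 1 << (c-1).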
n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -334,18 +344,25 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -369,8 +386,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -462,6 +481,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
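A worked instance of the borrowing rule this comment describes, with c = 4 so digits are forced into [-(2^3), 2^3] (a hypothetical toy helper, not the library code):

	// recode one c-bit window plus the incoming carry; c = 4 here.
	func recode(window, carry int) (digit, carryOut int) {
		digit = window + carry
		if digit > 1<<3 { // digit > 2^{c-1}: borrow 2^c from the next window
			return digit - 1<<4, 1
		}
		return digit, 0
	}

For the scalar 47 = 0b10_1111, the low window is 15 and the high window is 2: 15 recodes to -1 with a carry, the carry lifts the high window to 3, and indeed 3*16 - 1 = 47. Negative digits are what let one bucket serve both P and -P, halving the bucket count per chunk.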
@@ -495,15 +518,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -547,50 +562,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -598,31 +578,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
- } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps + // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if opsPerBucket[bucketID] == 0 { + nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } + + bound := 1 << (c - 1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c) - 1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b := 0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue + } + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + }, nbTasks) + + totalOps := 0 + for _, stat := range chunkStats { + totalOps += stat.weight } target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index f3c51b51bf..1de033659f 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -35,7 +35,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -44,8 +44,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -70,6 +72,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -114,12 +141,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -153,8 +187,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -163,15 +196,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -243,7 +277,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -252,8 +286,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -278,6 +314,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -322,12 +383,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -361,8 +429,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -371,15 +438,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 622c98195b..95312917d9 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 72a0b35b7b..951ee4bd04 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -18,6 +18,7 @@ package bw6756 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,18 +141,25 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -175,8 +183,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -334,18 +344,25 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -369,8 +386,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -462,6 +481,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
@@ -495,15 +518,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -547,50 +562,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -598,31 +578,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
- } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps + // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if opsPerBucket[bucketID] == 0 { + nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } + + bound := 1 << (c - 1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c) - 1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b := 0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue + } + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + }, nbTasks) + + totalOps := 0 + for _, stat := range chunkStats { + totalOps += stat.weight } target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 0925a40c35..5b99051b82 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -35,7 +35,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -44,8 +44,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -70,6 +72,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -114,12 +141,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -153,8 +187,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -163,15 +196,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -243,7 +277,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -252,8 +286,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -278,6 +314,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -322,12 +383,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -361,8 +429,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -371,15 +438,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index f7488f5dd7..a94008b836 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 32920cef38..fe9a1970a4 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -18,6 +18,7 @@ package bw6761 import ( "errors" + "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -140,18 +141,25 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG1Jacobian[bucketg1JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG1Jacobian[bucketg1JacExtendedC16] @@ -175,8 +183,10 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -334,18 +344,25 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: + // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - const batchSize = 640 - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + // const batchSize = 640 + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. + // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunkG2Jacobian[bucketg2JacExtendedC16] + // } + // fmt.Printf("affine \n") + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") return processChunkG2Jacobian[bucketg2JacExtendedC16] @@ -369,8 +386,10 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -462,6 +481,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract // 2^{c} to the current digit, making it negative. 
@@ -495,15 +518,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) - parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } for i := start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -547,50 +562,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) - // aggregate chunk stats - chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i] += nbOps - totalOps += nbOps - } - } - - opsPerBucketPerChunk := make([][]int, nbChunks) - for i := 0; i < len(opsPerBucketPerChunk); i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c - 1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ - } - opsPerBucketPerChunk[i][j] += o - } - } - } - abs := func(v int) int { if v < 0 { return -v @@ -598,31 +578,67 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks return v } - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID := 0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. 
- } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps + // aggregate chunk stats + chunkStats := make([]chunkStat, nbChunks) + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if opsPerBucket[bucketID] == 0 { + nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } + + bound := 1 << (c - 1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c) - 1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b := 0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue + } + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] + } + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + }, nbTasks) + + totalOps := 0 + for _, stat := range chunkStats { + totalOps += stat.weight } target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 6b4ec532ea..885062c223 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -35,7 +35,7 @@ func (o batchOpG1Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( +func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Affine, TQ qOpsG1Affine, TC cG1Affine]( chunk uint64, chRes chan<- g1JacExtended, c uint64, @@ -44,8 +44,10 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -70,6 +72,31 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G1Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -114,12 +141,19 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -153,8 +187,7 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -163,15 +196,16 @@ func processChunkG1BatchAffine[B ibG1Affine, BS bitSet, TP pG1Affine, TPP ppG1Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() @@ -243,7 +277,7 @@ func (o batchOpG2Affine) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( +func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Affine, TQ qOpsG2Affine, TC cG2Affine]( chunk uint64, chRes chan<- g2JacExtended, c uint64, @@ -252,8 +286,10 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -278,6 +314,31 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? + BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *G2Affine, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -322,12 +383,19 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af cptAdd++ } + flushQueue := func() { + for i := 0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } + processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -361,8 +429,7 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af // queue is full, flush it. if qID == len(queue)-1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -371,15 +438,16 @@ func processChunkG2BatchAffine[B ibG2Affine, BS bitSet, TP pG2Affine, TPP ppG2Af add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. 
executeAndReset() diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index c7fdcf64cf..7d9abce749 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index a274e26e66..f650f65dbc 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -14,6 +14,7 @@ import ( "errors" "math" "runtime" + "fmt" ) {{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} @@ -69,6 +70,10 @@ type chunkStat struct { averageOpsPerBucket int } +func (c *chunkStat) String() string { + return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) +} + // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits // if the digit is larger than 2^{c-1}, then, we borrow 2^c from the next window and substract @@ -104,15 +109,8 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks selectors[chunk] = d } - chOpsPerChunk := make(chan []int, nbTasks) - chOpsPerBucketPerChunk := make(chan [][]int, nbTasks) parallel.Execute(len(scalars), func(start, end int) { - opsPerChunk := make([]int, nbChunks) - opsPerBucketPerChunk := make([][]int, nbChunks) - for i:=0; i < len(opsPerBucketPerChunk);i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c-1))) // nbBuckets - } for i:=start; i < end; i++ { scalar := scalars[i] if scalarsMont { @@ -157,85 +155,84 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks var bits uint16 if digit > 0 { bits = uint16(digit) << 1 - opsPerBucketPerChunk[chunk][uint16(digit)]++ } else { bits = (uint16(-digit-1) << 1) + 1 - opsPerBucketPerChunk[chunk][uint16(-digit-1)]++ } digits[int(chunk)*len(scalars)+i] = bits - opsPerChunk[chunk]++ } } - chOpsPerChunk <- opsPerChunk - chOpsPerBucketPerChunk <- opsPerBucketPerChunk - }, nbTasks) + abs := func(v int) int { + if v < 0 { + return -v + } + return v + } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) - close(chOpsPerChunk) - close(chOpsPerBucketPerChunk) - opsPerChunk := make([]int, nbChunks) - totalOps := 0 - for chunks := range chOpsPerChunk { - for i, nbOps := range chunks { - opsPerChunk[i]+=nbOps - totalOps += nbOps - } - } + parallel.Execute(len(chunkStats), func(start, end int) { + // for each chunk compute the statistics + for chunkID := start; chunkID < end; chunkID++ { + var opsPerBucket [1 << 15]int // max value is 16 for c + // digits for the chunk + chunkDigits := digits[chunkID*len(scalars):(chunkID+1)*len(scalars)] + + totalOps := 0 + nz := 0 // non zero buckets count + for _, digit := range chunkDigits { + if digit == 0 { + continue + } + totalOps++ + bucketID := digit >> 1 + if digit &1 == 0 { + bucketID-=1 + } + if opsPerBucket[bucketID] == 0 { + 
nz++ + } + opsPerBucket[bucketID]++ + } + chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after + chunkStats[chunkID].nonZeroBuckets = nz + + if nz == 0 { + return // no ops, only zeroes + } - opsPerBucketPerChunk := make([][]int, nbChunks) - for i:=0; i < len(opsPerBucketPerChunk);i++ { - opsPerBucketPerChunk[i] = make([]int, (1 << (c-1))) // nbBuckets - } - for chunks := range chOpsPerBucketPerChunk { - for i, opsPerBucket := range chunks { - for j, o := range opsPerBucket { - // bucket j in chunk i has o operations - if opsPerBucketPerChunk[i][j] == 0 && o != 0 { - chunkStats[i].nonZeroBuckets++ + bound := 1 << (c-1) + if chunkID == int(nbChunks-1) { + bound = 1 << (lastC(c)-1) + } + mean := totalOps / nz + aad := 0 + averageOpsPerBucket := 0 + for b:=0; b < bound; b++ { + if opsPerBucket[b] == 0 { + continue } - opsPerBucketPerChunk[i][j] += o + aad += abs(opsPerBucket[b] - mean) + averageOpsPerBucket += opsPerBucket[b] } - } - } - - abs := func(v int) int { - if v < 0 { - return -v + chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + chunkStats[chunkID].deviation = aad / nz } - return v - } + }, nbTasks) - // we know the total ops for the chunk, the number of non zero buckets - // so we can compute the deviation; - // TODO @gbotrel do that in go routines - for chunkID:=0; chunkID < len(chunkStats); chunkID++ { - nz := chunkStats[chunkID].nonZeroBuckets - if nz == 0 { - continue // ignore chunk, full of zeroes. - } - mean := opsPerChunk[chunkID] / nz - aad := 0 - averageOpsPerBucket := 0 - for _, bucketOps := range opsPerBucketPerChunk[chunkID] { - aad += abs(bucketOps - mean) - averageOpsPerBucket += bucketOps - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + totalOps := 0 + for _, stat := range chunkStats { + totalOps+=stat.weight } - target := totalOps / int(nbChunks) - // what percentage are you of the target if target != 0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = opsPerChunk[i] * 100 / target + chunkStats[i].weight = (chunkStats[i].weight * 100) / target } } @@ -494,14 +491,19 @@ func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} + // fmt.Printf("jacobian \n") return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - const batchSize = {{batchSize $c}} - edgeCaseAffine := (stat.nonZeroBuckets < (batchSize)) || (stat.deviation >= 4) // stat.averageOpsPerBucket/3) - if edgeCaseAffine { - return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] - } - return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] + // const batchSize = {{batchSize $c}} + // status: we are losing in perf here in the nominal case. + // stat.deviation seems not good. 
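The guard deciding between the Jacobian and batch-affine processors is disabled here because its thresholds misfired in the nominal case; restated as a standalone predicate for clarity (illustration only, with the experimental thresholds from the commented-out code):

	// preferJacobian restates the disabled edge-case guard: fall back to the
	// extended-Jacobian processor when non-empty buckets are scarce relative
	// to the batch size (heavy intra-batch conflicts) or when ops are very
	// unevenly spread across buckets.
	func preferJacobian(stat chunkStat, batchSize int) bool {
		return batchSize > stat.nonZeroBuckets/10 || stat.deviation >= 20
	}

The intuition: when few buckets are non-empty, most ops collide with the current batch, the conflict queue dominates, and the shared inversion amortizes over too few independent additions to beat plain extended-Jacobian adds.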
+ // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) + // if edgeCaseAffine { + // // fmt.Printf("jacobian \n") + // return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + // } + // fmt.Printf("affine \n") + return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} {{- end}} default: @@ -527,8 +529,10 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - + // fmt.Printf("\n") + // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { + // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) processChunk := getChunkProcessor{{ $.UPointName }}(c, chunkStats[j]) if j == int(nbChunks - 1) { processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c), chunkStats[j]) diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 513b06f96f..1763f28292 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -41,7 +41,7 @@ func (o batchOp{{ $.TAffine }}) isNeg() bool { // // this is derived from a PR by 0x0ece : https://github.com/ConsenSys/gnark-crypto/pull/249 // See Section 5.3: ia.cr/2022/1396 -func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, TP p{{ $.TAffine }}, TPP pp{{ $.TAffine }}, TQ qOps{{ $.TAffine }}, TC c{{ $.TAffine}}]( +func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B ib{{ $.TAffine }}, BS bitSet, TP p{{ $.TAffine }}, TPP pp{{ $.TAffine }}, TQ qOps{{ $.TAffine }}, TC c{{ $.TAffine}}]( chunk uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, @@ -50,8 +50,10 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T // init the buckets var buckets B + var bucketsJE BJE for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() + bucketsJE[i].setInfinity() } // setup for the batch affine; @@ -77,7 +79,31 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOp{{ $.TAffine }}) { + // @precondition: ensures bucket is not "used" in current batch + BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P + if BK.IsInfinity() { + BK.Set(&op.point) + return + } + if BK.X.Equal(&op.point.X) { + if BK.Y.Equal(&op.point.Y) { + // P + P: doubling, which should be quite rare -- + // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. + // need doubling in affine implemented ? 
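On the TODO just above: the missing affine doubling is the standard tangent formula, shown here for G1 as a sketch (assuming the short-Weierstrass curves used here with a = 0; the G2 variant is identical over fptower.E2). It spends a full field inversion per call, which is tolerable only because this path is expected to be rare:

	// doubleAffine is a hypothetical helper: p <- 2p in affine coordinates.
	func doubleAffine(p *G1Affine) {
		var lambda, num, den, t, xr fp.Element
		num.Square(&p.X)  // x^2
		t.Double(&num)
		num.Add(&num, &t) // 3x^2 (no +a term since a = 0)
		den.Double(&p.Y)  // 2y
		den.Inverse(&den) // the inversion that makes this path expensive
		lambda.Mul(&num, &den)
		xr.Square(&lambda)
		xr.Sub(&xr, &p.X)
		xr.Sub(&xr, &p.X) // x3 = lambda^2 - 2x
		t.Sub(&p.X, &xr)
		t.Mul(&lambda, &t)
		p.Y.Sub(&t, &p.Y) // y3 = lambda*(x - x3) - y
		p.X.Set(&xr)
	}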
+ BK.Add(BK, BK) + return + } + BK.setInfinity() + return + } + bucketIds[op.bucketID] = true + R[cptAdd] = BK + P[cptAdd] = op.point + cptAdd++ + } add := func(bucketID uint16, PP *{{$.TAffine}}, isAdd bool) { // @precondition: ensures bucket is not "used" in current batch @@ -122,13 +148,19 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T cptAdd++ } + flushQueue := func () { + for i:=0; i < qID; i++ { + bucketsJE[queue[i].bucketID].addMixed(&queue[i].point) + } + qID = 0 + } processQueue := func () { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { continue } - add(queue[i].bucketID, &queue[i].point, true) + addFromQueue(queue[i]) if isFull() { executeAndReset() } @@ -163,8 +195,7 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T // queue is full, flush it. if qID == len(queue) - 1 { - executeAndReset() - processQueue() + flushQueue() } continue } @@ -173,15 +204,16 @@ func processChunk{{ $.UPointName }}BatchAffine[B ib{{ $.TAffine }}, BS bitSet, T add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() + processQueue() // TODO top queue only } } // empty the queue - for qID != 0 { - processQueue() - executeAndReset() - } + flushQueue() + // for qID != 0 { + // processQueue() + // executeAndReset() + // } // flush items in batch. executeAndReset() diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index f208891b47..91af5a996b 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -267,7 +267,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var testPoint {{ $.TAffine }} - for i := 15; i <= pow; i++ { + for i := 22; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { From fcdcbfdcf58800025a9968b08a0ef46e3a0bbdac Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 07:28:14 -0600 Subject: [PATCH 29/43] checkpoint --- ecc/bls12-377/multiexp.go | 76 ++++++++-------- ecc/bls12-377/multiexp_affine.go | 86 +++++++++++++------ ecc/bls12-378/multiexp.go | 76 ++++++++-------- ecc/bls12-378/multiexp_affine.go | 86 +++++++++++++------ ecc/bls12-381/multiexp.go | 76 ++++++++-------- ecc/bls12-381/multiexp_affine.go | 86 +++++++++++++------ ecc/bls24-315/multiexp.go | 76 ++++++++-------- ecc/bls24-315/multiexp_affine.go | 86 +++++++++++++------ ecc/bls24-317/multiexp.go | 76 ++++++++-------- ecc/bls24-317/multiexp_affine.go | 86 +++++++++++++------ ecc/bn254/multiexp.go | 76 ++++++++-------- ecc/bn254/multiexp_affine.go | 86 +++++++++++++------ ecc/bw6-633/multiexp.go | 76 ++++++++-------- ecc/bw6-633/multiexp_affine.go | 86 +++++++++++++------ ecc/bw6-756/multiexp.go | 76 ++++++++-------- ecc/bw6-756/multiexp_affine.go | 86 +++++++++++++------ ecc/bw6-761/multiexp.go | 76 ++++++++-------- ecc/bw6-761/multiexp_affine.go | 86 +++++++++++++------ .../generator/ecc/template/multiexp.go.tmpl | 74 ++++++++-------- .../ecc/template/multiexp_affine.go.tmpl | 43 +++++++--- 20 files changed, 931 insertions(+), 644 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index b0c3c27e36..bc85684256 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // 
abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index e953a6f079..b2c167e9bc 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) 
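+ // bucket was empty: record occupancy and store the point as-is;
+ // no batch slot and no field inversion are needed for this op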
return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
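+ // (ordering note: executeAndReset applies the pending batch-affine
+ // additions to the affine buckets; flushQueue below then drains the
+ // conflicting ops into the separate extended-Jacobian buckets bucketsJE,
+ // and the two bucket arrays are merged only in the final reduction.)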
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 5833c4e009..7164359d9f 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 533cb7304c..af73df4882 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID 
uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
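+ // (for reference, the reduction below computes
+ // total = 1*bucket[0] + 2*bucket[1] + ... + n*bucket[n-1]
+ // with a single running sum; the same idea over ints, with bucket and n
+ // as illustrative stand-ins:
+ //
+ //   total, runningSum := 0, 0
+ //   for k := n - 1; k >= 0; k-- {
+ //       runningSum += bucket[k] // runningSum = bucket[k] + ... + bucket[n-1]
+ //       total += runningSum     // so bucket[k] is counted (k+1) times overall
+ //   }
+ // )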
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 50b6f180d3..902a6245f9 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 61ebeeebb0..992efc3fa4 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID 
uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
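+ // (recap of the special cases in add/addFromQueue above: when the op and
+ // the bucket share an X coordinate, an equal Y means the op degenerates
+ // to a doubling of the bucket, and an opposite Y means the points cancel,
+ // so the bucket is cleared and its bucketSet bit reset; subtractions
+ // mirror the same two cases with the roles swapped.)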
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 3c05caa715..16a77feb26 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 85af6357fd..ac3db048e6 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID 
uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
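+ // (why conflicting ops are queued: one batch shares a single field
+ // inversion across all its lambda denominators, computed from a snapshot
+ // of the buckets, so each bucket may appear at most once per batch; a
+ // second op on the same bucket within a batch would read a stale bucket
+ // value. At this checkpoint, flushQueue sends such ops straight into
+ // bucketsJE instead of retrying them in a later batch.)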
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 2f80a71f5c..9c8c78a3f6 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index a6fcb5a2f8..4e241f79a5 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID 
uint32 + point G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
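+ // (sketch of what executeAndReset computes per batch slot, the textbook
+ // affine chord rule with the inversion in lambda amortized over the whole
+ // batch; this shows the math only, not the exact field-call sequence:
+ //
+ //   lambda = (y2 - y1) / (x2 - x1)  // tangent slope instead for P+P
+ //   x3 = lambda^2 - x1 - x2
+ //   y3 = lambda*(x1 - x3) - y1
+ // )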
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 0767ef87aa..b06acb685c 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -721,61 +721,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 044f6834fb..a268d19c87 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -23,11 +23,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point G1Affine } func (o 
batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -46,6 +48,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -60,6 +63,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -77,7 +81,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -90,6 +95,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -103,12 +109,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -120,12 +127,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -152,13 +161,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -180,8 +190,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -201,6 +213,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -208,9 +223,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -218,9 +230,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -337,11 +352,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -360,6 +377,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -374,6 +392,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -391,7 +410,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -404,6 +424,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -417,12 +438,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -434,12 +456,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -466,13 +490,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -494,8 +519,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -515,6 +542,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
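The reduction that closes each of these chunk processors uses the running-sum identity total = bucket[0] + 2*bucket[1] + ... + n*bucket[n-1], at two group additions per bucket. A minimal sketch of the identity over plain integers (reduce is a hypothetical stand-in, not the curve types used in this patch):

package main

import "fmt"

// reduce mirrors the runningSum/total loop of the chunk processors:
// scanning from the top bucket down, runningSum holds
// bucket[k] + bucket[k+1] + ... + bucket[n-1], so bucket[k] is folded
// into total exactly (k+1) times.
func reduce(bucket []int) int {
	runningSum, total := 0, 0
	for k := len(bucket) - 1; k >= 0; k-- {
		runningSum += bucket[k]
		total += runningSum
	}
	return total
}

func main() {
	fmt.Println(reduce([]int{3, 5, 7})) // 1*3 + 2*5 + 3*7 = 34
}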
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -522,9 +552,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -532,9 +559,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 6bc12d487a..1bc8b9e0ef 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -571,61 +571,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 1de033659f..a10b6cbf76 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -22,11 +22,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point 
G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,6 +47,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -59,6 +62,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -76,7 +80,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -89,6 +94,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -102,12 +108,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -119,12 +126,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -151,13 +160,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -179,8 +189,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -200,6 +212,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -207,9 +222,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -217,9 +229,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -264,11 +279,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -287,6 +304,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -301,6 +319,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -318,7 +337,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -331,6 +351,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -344,12 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -361,12 +383,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -393,13 +417,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -421,8 +446,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -442,6 +469,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -449,9 +479,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -459,9 +486,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 951ee4bd04..e695c8791f 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -571,61 +571,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 5b99051b82..00e9053fd1 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -22,11 +22,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point 
G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,6 +47,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -59,6 +62,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -76,7 +80,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -89,6 +94,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -102,12 +108,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -119,12 +126,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -151,13 +160,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -179,8 +189,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -200,6 +212,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -207,9 +222,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -217,9 +229,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -264,11 +279,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -287,6 +304,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -301,6 +319,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -318,7 +337,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -331,6 +351,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -344,12 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -361,12 +383,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -393,13 +417,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -421,8 +446,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -442,6 +469,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
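The bucketSet updates above replace the previous BK.IsInfinity() test: once a plain affine bucket can be cancelled back to infinity by P + (-P), it is cheaper and unambiguous to track occupancy in a dedicated bit per bucket than to compare field elements against an infinity encoding. A sketch under assumed placeholder types (point and window are illustrative only):

package sketch

// point is a stand-in for an affine curve point.
type point struct{ x, y uint64 }

// window pairs each affine bucket with an occupancy bit, the role played
// by bucketSet above: untouched buckets and buckets cancelled by P + (-P)
// both read as free with a single bool load.
type window struct {
	buckets   [512]point
	bucketSet [512]bool
}

// firstHit stores p if the bucket is free and reports whether it did;
// a false return means a real addition (or doubling) is needed.
func (w *window) firstHit(i int, p point) bool {
	if w.bucketSet[i] {
		return false
	}
	w.buckets[i] = p
	w.bucketSet[i] = true
	return true
}

// cancel frees a bucket whose sum became the point at infinity.
func (w *window) cancel(i int) {
	w.bucketSet[i] = false
}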
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -449,9 +479,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -459,9 +486,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index fe9a1970a4..b89e06e566 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -571,61 +571,61 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit&1 == 0 { - bucketID -= 1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz - - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c - 1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c) - 1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b := 0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // chunkStats[chunkID].nonZeroBuckets = nz + + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 885062c223..13f133daa5 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -22,11 +22,13 @@ import ( type batchOpG1Affine struct { bucketID uint16 - point G1Affine + // pointID uint32 + point 
G1Affine } func (o batchOpG1Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,6 +47,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -59,6 +62,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG1Affine batchSize := len(P) @@ -76,7 +80,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -89,6 +94,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -102,12 +108,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -119,12 +126,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -151,13 +160,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -179,8 +189,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -200,6 +212,9 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP } } + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -207,9 +222,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -217,9 +229,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } @@ -264,11 +279,13 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - point G2Affine + // pointID uint32 + point G2Affine } func (o batchOpG2Affine) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -287,6 +304,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -301,6 +319,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOpG2Affine batchSize := len(P) @@ -318,7 +337,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -331,6 +351,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -344,12 +365,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -361,12 +383,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -393,13 +417,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP processQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -421,8 +446,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -442,6 +469,9 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP } } + // flush items in batch. 
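The deleted stats code above recovers a bucket index from a digit as bucketID := digit >> 1, minus one when the digit is even. One packing convention consistent with that arithmetic is sketched below; which parity encodes subtraction is an assumption here, and pack/unpack are hypothetical helpers, not functions of this library:

package sketch

// pack stores a window value v in [1, 2^(c-1)] as 2v for an addition and
// 2v-1 for a subtraction; 0 is reserved for "no contribution".
func pack(v int, sub bool) uint16 {
	if v == 0 {
		return 0
	}
	if sub {
		return uint16(2*v - 1)
	}
	return uint16(2 * v)
}

// unpack inverts pack using the same index arithmetic as the stats loop:
// both 2v and 2v-1 map to bucket v-1, and the low bit keeps the sign.
func unpack(digit uint16) (bucketID uint16, sub bool) {
	if digit == 0 {
		return // a zero digit touches no bucket
	}
	sub = digit&1 == 1
	bucketID = digit >> 1
	if digit&1 == 0 {
		bucketID--
	}
	return
}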
+ executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -449,9 +479,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // executeAndReset() // } - // flush items in batch. - executeAndReset() - // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] @@ -459,9 +486,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index f650f65dbc..b9563c9a61 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -164,62 +164,62 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - abs := func(v int) int { - if v < 0 { - return -v - } - return v - } + // abs := func(v int) int { + // if v < 0 { + // return -v + // } + // return v + // } // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - var opsPerBucket [1 << 15]int // max value is 16 for c + // var opsPerBucket [1 << 15]int // max value is 16 for c // digits for the chunk chunkDigits := digits[chunkID*len(scalars):(chunkID+1)*len(scalars)] totalOps := 0 - nz := 0 // non zero buckets count + // nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - bucketID := digit >> 1 - if digit &1 == 0 { - bucketID-=1 - } - if opsPerBucket[bucketID] == 0 { - nz++ - } - opsPerBucket[bucketID]++ + // bucketID := digit >> 1 + // if digit &1 == 0 { + // bucketID-=1 + // } + // if opsPerBucket[bucketID] == 0 { + // nz++ + // } + // opsPerBucket[bucketID]++ } chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - chunkStats[chunkID].nonZeroBuckets = nz + // chunkStats[chunkID].nonZeroBuckets = nz - if nz == 0 { - return // no ops, only zeroes - } - - bound := 1 << (c-1) - if chunkID == int(nbChunks-1) { - bound = 1 << (lastC(c)-1) - } - mean := totalOps / nz - aad := 0 - averageOpsPerBucket := 0 - for b:=0; b < bound; b++ { - if opsPerBucket[b] == 0 { - continue - } - aad += abs(opsPerBucket[b] - mean) - averageOpsPerBucket += opsPerBucket[b] - } - chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - chunkStats[chunkID].deviation = aad / nz + // if nz == 0 { + // return // no ops, only zeroes + // } + + // bound := 1 << (c-1) + // if chunkID == int(nbChunks-1) { + // bound = 1 << (lastC(c)-1) + // } + // mean := totalOps / nz + // aad := 0 + // averageOpsPerBucket := 0 + // for b:=0; b < bound; b++ { + // if opsPerBucket[b] == 0 { + // continue + // } + // aad += abs(opsPerBucket[b] - mean) + // averageOpsPerBucket += opsPerBucket[b] + // } + // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz + // chunkStats[chunkID].deviation = aad / nz } }, nbTasks) diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 1763f28292..6463bc1ede 100644 --- 
a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -28,11 +28,13 @@ import ( type batchOp{{ $.TAffine }} struct { bucketID uint16 + // pointID uint32 point {{ $.TAffine }} } func (o batchOp{{ $.TAffine }}) isNeg() bool { - return o.bucketID&1 == 1 + return false + // return o.pointID&1 == 1 } // processChunk{{ $.UPointName }}BatchAffine process a chunk of the scalars during the msm @@ -51,6 +53,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // init the buckets var buckets B var bucketsJE BJE + var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -65,6 +68,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B queue TQ // queue of points that conflict the current batch qID int // current position in queue ) + // var queue [batchSize]batchOp{{ $.TAffine}} batchSize := len(P) @@ -83,7 +87,8 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // @precondition: ensures bucket is not "used" in current batch BK := &buckets[op.bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[op.bucketID] { + bucketSet[op.bucketID] = true BK.Set(&op.point) return } @@ -96,6 +101,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B return } BK.setInfinity() + bucketSet[op.bucketID] = false return } @@ -109,12 +115,13 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if BK.IsInfinity() { + if !bucketSet[bucketID] { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } + bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -125,13 +132,15 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B if isAdd { BK.Add(BK, BK) } else { - BK.setInfinity() + BK.setInfinity() + bucketSet[bucketID] = false } return } if isAdd { BK.setInfinity() + bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -158,13 +167,14 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B processQueue := func () { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { - continue + return } addFromQueue(queue[i]) - if isFull() { - executeAndReset() - } - queue[i] = queue[qID-1] + // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) + // if isFull() { + // executeAndReset() + // } + // queue[i] = queue[qID-1] qID-- } } @@ -187,8 +197,10 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // put it in queue queue[qID].bucketID = bucketID if isAdd { - queue[qID].point = points[i] + // queue[qID].pointID = uint32(i << 1) + queue[qID].point.Set(&points[i]) } else { + // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -208,6 +220,10 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B } } + + // flush items in batch. + executeAndReset() + // empty the queue flushQueue() // for qID != 0 { @@ -215,8 +231,6 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // executeAndReset() // } - // flush items in batch. - executeAndReset() // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... 
+ n*bucket[n-1] @@ -225,9 +239,12 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if !buckets[k].IsInfinity() { + if bucketSet[k] { runningSum.addMixed(&buckets[k]) } + if !bucketsJE[k].ZZ.IsZero() { + runningSum.add(&bucketsJE[k]) + } total.add(&runningSum) } From b096408205a69b35ae735136b2280cf7582c55ba Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 09:37:44 -0600 Subject: [PATCH 30/43] checkpoint --- ecc/bls12-377/multiexp.go | 335 +++++++----------- ecc/bls12-377/multiexp_test.go | 148 ++++---- ecc/bls12-378/multiexp.go | 335 +++++++----------- ecc/bls12-378/multiexp_test.go | 148 ++++---- ecc/bls12-381/multiexp.go | 335 +++++++----------- ecc/bls12-381/multiexp_test.go | 148 ++++---- ecc/bls24-315/multiexp.go | 335 +++++++----------- ecc/bls24-315/multiexp_test.go | 148 ++++---- ecc/bls24-317/multiexp.go | 335 +++++++----------- ecc/bls24-317/multiexp_test.go | 148 ++++---- ecc/bn254/multiexp.go | 335 +++++++----------- ecc/bn254/multiexp_test.go | 148 ++++---- ecc/bw6-633/multiexp.go | 137 +++---- ecc/bw6-633/multiexp_test.go | 148 ++++---- ecc/bw6-756/multiexp.go | 137 +++---- ecc/bw6-756/multiexp_test.go | 148 ++++---- ecc/bw6-761/multiexp.go | 137 +++---- ecc/bw6-761/multiexp_test.go | 148 ++++---- .../generator/ecc/template/multiexp.go.tmpl | 110 ++---- .../ecc/template/tests/multiexp.go.tmpl | 70 ++-- 20 files changed, 1657 insertions(+), 2276 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index bc85684256..b87523e4e0 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -18,7 +18,6 @@ package bls12377 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
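The replacement logic in all of these cases follows one rule: each window width c carries a batch size (80 for c=10 up to 640 for c=16), and a chunk that fills fewer buckets than one batch holds can never assemble a conflict-free batch, so it falls back to the extended-Jacobian processor. A sketch of the rule with illustrative names (useBatchAffine, batchSizeForC):

package sketch

// batchSizeForC copies the per-window batch sizes appearing in the
// dispatch above; windows below c=10 never take the batch affine path.
var batchSizeForC = map[uint64]int{
	10: 80, 11: 150, 12: 200, 13: 350, 14: 400, 15: 500, 16: 640,
}

// useBatchAffine mirrors the stat.nbBucketFilled < batchSize fallback:
// batch affine pays off only when a batch can be filled with updates to
// distinct buckets.
func useBatchAffine(c uint64, nbBucketFilled int) bool {
	bs, ok := batchSizeForC[c]
	return ok && nbBucketFilled >= bs
}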
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. 
- // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
 	n := len(points)
-	// fmt.Printf("\n")
-	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
+		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -618,21 +571,18 @@ func lastC(c uint64) uint64 {
 type chunkStat struct {
 	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
-	weight int
-
-	// average absolute deviation. this is meant to give a sense of statistical
-	// dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
-	deviation int
+	weight float32

-	// count the number of buckets that are non zeroes for this chunk
-	nonZeroBuckets int
+	// // average absolute deviation. this is meant to give a sense of statistical
+	// // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
+	// deviation int

-	// average ops per non-zero buckets
-	averageOpsPerBucket int
-}
+	// percentage of buckets filled in the window
+	ppBucketFilled float32
+	nbBucketFilled int

-func (c *chunkStat) String() string {
-	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+	// // average ops per non-zero buckets
+	// averageOpsPerBucket int
 }

 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	}, nbTasks)

-	// abs := func(v int) int {
-	// 	if v < 0 {
-	// 		return -v
-	// 	}
-	// 	return v
-	// }
-
 	// aggregate chunk stats
 	chunkStats := make([]chunkStat, nbChunks)
 	parallel.Execute(len(chunkStats), func(start, end int) {
 		// for each chunk compute the statistics
 		for chunkID := start; chunkID < end; chunkID++ {
-			// var opsPerBucket [1 << 15]int // max value is 16 for c
+			// indicates if a bucket is hit.
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 38d85e79ea..8a036b0f54 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
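The statistics pass added to partitionScalars in the hunks above is easier to read outside its parallel.Execute wrapper. The sketch below restates it sequentially: decode each non-zero digit into its bucket (the low bit of a digit distinguishes an addition from a subtraction, so both map into the same bucket), count operations and distinct buckets, then rescale weights so that 100.0 means an average chunk. The slice-of-slices input and the fixed c are simplifications for illustration, not the generated code:

package main

import "fmt"

const c = 16 // widest generated window; 2^(c-1) buckets per chunk

type chunkStat struct {
	weight         float32 // 100.0 -> nominal share of the total work
	ppBucketFilled float32 // percentage of buckets hit in this window
	nbBucketFilled int
}

func computeStats(digitsPerChunk [][]uint16) []chunkStat {
	stats := make([]chunkStat, len(digitsPerChunk))
	var totalOps float32
	for chunkID, chunkDigits := range digitsPerChunk {
		var hit [1 << (c - 1)]bool // same role as the bitSetC16 above
		ops, nz := 0, 0
		for _, digit := range chunkDigits {
			if digit == 0 {
				continue // zero digit: this scalar skips the chunk entirely
			}
			ops++
			// mirror the patch's decoding of a non-zero digit into its
			// bucket index; even digits are shifted down by one slot.
			bucketID := digit >> 1
			if digit&1 == 0 {
				bucketID--
			}
			if !hit[bucketID] {
				nz++
				hit[bucketID] = true
			}
		}
		stats[chunkID].weight = float32(ops)
		stats[chunkID].ppBucketFilled = float32(nz) * 100.0 / float32(1<<(c-1))
		stats[chunkID].nbBucketFilled = nz
		totalOps += float32(ops)
	}
	// second pass: rescale so that an average chunk weighs exactly 100.0
	if target := totalOps / float32(len(stats)); target != 0 {
		for i := range stats {
			stats[i].weight = stats[i].weight * 100.0 / target
		}
	}
	return stats
}

func main() {
	fmt.Printf("%+v\n", computeStats([][]uint16{{0, 3, 3, 8}, {5, 0, 0, 0}}))
}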
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 7164359d9f..702015ccb8 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -18,7 +18,6 @@ package bls12378 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-378/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
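For context, the surrounding _innerMsmG1 body (unchanged here apart from comments) is a plain fan-out/fan-in: one goroutine per chunk, each reporting its partial result on its own channel, reduced in window order by the caller. A minimal skeleton of that pattern, with an integer payload standing in for g1JacExtended and placeholder work instead of bucket accumulation:

package main

import "fmt"

// processChunk is a stand-in for the per-chunk bucket accumulation.
func processChunk(chunkID int, chRes chan<- int) {
	chRes <- chunkID * chunkID // placeholder work
}

func main() {
	const nbChunks = 8
	chChunks := make([]chan int, nbChunks)
	for i := range chChunks {
		chChunks[i] = make(chan int, 1) // buffered: the sender never blocks
	}
	// fan out, highest window first, as in the MSM inner loop
	for j := nbChunks - 1; j >= 0; j-- {
		go processChunk(j, chChunks[j])
	}
	// fan in: consume the results in deterministic chunk order
	total := 0
	for j := nbChunks - 1; j >= 0; j-- {
		total += <-chChunks[j]
	}
	fmt.Println(total)
}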
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+			return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		}
 		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
@@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// note that buckets is an array allocated on the stack and this is critical for performance

 	// each go routine sends its result in chChunks[i] channel
 	chChunks := make([]chan g2JacExtended, nbChunks)
@@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-	// fmt.Printf("\n")
-	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
+		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -618,21 +571,18 @@ func lastC(c uint64) uint64 {
 type chunkStat struct {
 	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
-	weight int
-
-	// average absolute deviation. this is meant to give a sense of statistical
-	// dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
-	deviation int
+	weight float32

-	// count the number of buckets that are non zeroes for this chunk
-	nonZeroBuckets int
+	// // average absolute deviation. this is meant to give a sense of statistical
+	// // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
+	// deviation int

-	// average ops per non-zero buckets
-	averageOpsPerBucket int
-}
+	// percentage of buckets filled in the window
+	ppBucketFilled float32
+	nbBucketFilled int

-func (c *chunkStat) String() string {
-	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+	// // average ops per non-zero buckets
+	// averageOpsPerBucket int
 }

 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	}, nbTasks)

-	// abs := func(v int) int {
-	// 	if v < 0 {
-	// 		return -v
-	// 	}
-	// 	return v
-	// }
-
 	// aggregate chunk stats
 	chunkStats := make([]chunkStat, nbChunks)
 	parallel.Execute(len(chunkStats), func(start, end int) {
 		// for each chunk compute the statistics
 		for chunkID := start; chunkID < end; chunkID++ {
-			// var opsPerBucket [1 << 15]int // max value is 16 for c
+			// indicates if a bucket is hit.
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 846eab44a1..1e9ff1e4de 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
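The comment above, together with the removed loop that follows it, describes the second of the two adversarial benchmark inputs this change disables: "small values" concentrates work in the first window, while "redundancy" maximizes bucket conflicts within a batch. A self-contained reconstruction of both generators, lifted from the commented-out code; it assumes it sits next to this test file (package bls12378) so it can reuse the existing fillBenchScalars helper:

package bls12378

import "github.com/consensys/gnark-crypto/ecc/bls12-378/fr"

// benchScalarDistributions rebuilds the two adversarial inputs from the
// disabled benchmarks: every 5th scalar set to 1 (small values), and blocks
// of 100 identical scalars (redundancy).
func benchScalarDistributions(n int) (small, redundant []fr.Element) {
	base := make([]fr.Element, n)
	fillBenchScalars(base)

	small = make([]fr.Element, n)
	redundant = make([]fr.Element, n)
	copy(small, base)
	copy(redundant, base)

	// small values: the first chunk gets most of the work and should be
	// split across several goroutines.
	for i := 0; i < len(small); i++ {
		if i%5 == 0 {
			small[i].SetZero()
			small[i][0] = 1
		}
	}

	// redundancy: within each window many scalars hit the same bucket,
	// forcing the batch-affine queue to flush in small batches.
	for i := 0; i < len(redundant); i += 100 {
		for j := i + 1; j < i+100 && j < len(redundant); j++ {
			redundant[j] = redundant[i]
		}
	}
	return small, redundant
}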
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 902a6245f9..91e471a850 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -18,7 +18,6 @@ package bls12381 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
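The reworded comment above is worth unpacking: the bucket storage must have a compile-time length to stay on the goroutine stack, which is exactly why a processor is instantiated per window size (bucketg1JacExtendedC4 through C16) rather than sized at runtime. A toy illustration of the difference; the names and the counting payload are made up, only the array-versus-slice point carries over:

package main

import "fmt"

// sumBucketsC4 mimics one generated chunk processor: the bucket storage is a
// fixed-size array, so it can be stack-allocated, whereas a make([]int, n)
// with a runtime-dependent length would be heap-allocated instead.
func sumBucketsC4(digits []uint16) int {
	var buckets [1 << 3]int // 2^(c-1) buckets for a toy c = 4
	for _, d := range digits {
		if d == 0 {
			continue
		}
		buckets[int(d)%len(buckets)]++
	}
	total := 0
	for _, b := range buckets {
		total += b
	}
	return total
}

func main() {
	fmt.Println(sumBucketsC4([]uint16{0, 1, 2, 9, 9}))
}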
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+			return processChunkG2Jacobian[bucketg2JacExtendedC16]
+		}
 		return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16]
 	default:
 		// panic("will not happen c != previous values is not generated by templates")
@@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
 	// corresponding bit-window
-	// note that buckets is an array allocated on the stack (for most sizes of c) and this is
-	// critical for performance
+	// note that buckets is an array allocated on the stack and this is critical for performance

 	// each go routine sends its result in chChunks[i] channel
 	chChunks := make([]chan g2JacExtended, nbChunks)
@@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
-	// fmt.Printf("\n")
-	// fmt.Println("n", n)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String())
+		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -618,21 +571,18 @@ func lastC(c uint64) uint64 {
 type chunkStat struct {
 	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
-	weight int
-
-	// average absolute deviation. this is meant to give a sense of statistical
-	// dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
-	deviation int
+	weight float32

-	// count the number of buckets that are non zeroes for this chunk
-	nonZeroBuckets int
+	// // average absolute deviation. this is meant to give a sense of statistical
+	// // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
+	// deviation int

-	// average ops per non-zero buckets
-	averageOpsPerBucket int
-}
+	// percentage of buckets filled in the window
+	ppBucketFilled float32
+	nbBucketFilled int

-func (c *chunkStat) String() string {
-	return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket)
+	// // average ops per non-zero buckets
+	// averageOpsPerBucket int
 }

 // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits
@@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	}, nbTasks)

-	// abs := func(v int) int {
-	// 	if v < 0 {
-	// 		return -v
-	// 	}
-	// 	return v
-	// }
-
 	// aggregate chunk stats
 	chunkStats := make([]chunkStat, nbChunks)
 	parallel.Execute(len(chunkStats), func(start, end int) {
 		// for each chunk compute the statistics
 		for chunkID := start; chunkID < end; chunkID++ {
-			// var opsPerBucket [1 << 15]int // max value is 16 for c
+			// indicates if a bucket is hit.
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 65e74e0491..bb1a3ac61e 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 16a77feb26..46a8c8bd4f 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -18,7 +18,6 @@ package bls24315 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-315/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
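	// (editorial aside, assumption about lastC: it returns the bit-width of
	// the final window, which is narrower whenever c does not divide the
	// scalar bit-length; e.g. a 254-bit scalar split with c == 16 leaves
	// 254 - 15*16 = 14 bits for the last chunk.)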
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
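	// (illustrative, hypothetical numbers: with c == 12 there are 1<<11 = 2048
	// buckets; if the stats pass saw only ~150 of them hit, batches of 200
	// conflict-free additions can rarely be filled, so the extended-jacobian
	// processor is the better trade here.)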
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
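	// (editorial note: the batchSize constants, 80 at c == 10 up to 640 here
	// at c == 16, read as empirically tuned; they grow with the bucket count,
	// though not proportionally to it.)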
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -618,21 +571,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int - - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + weight float32 - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
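	// (editorial note: b is declared inside the per-chunk loop, so it starts
	// zeroed for every chunk; no explicit reset is needed between chunks.)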
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 1eb2ff3e0f..f44d8d7b81 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 9c8c78a3f6..d5232436d2 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -18,7 +18,6 @@ package bls24317 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bls24-317/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -618,21 +571,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int - - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + weight float32 - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
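	// (editorial observation: ppBucketFilled below divides by 1<<(c-1) for
	// every chunk, including the last one, which actually has only
	// 1<<(lastC(c)-1) buckets; the fill percentage reported for that chunk is
	// therefore a slight underestimate.)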
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 14af45e5b5..b3faa9e76f 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index b06acb685c..6c68b58cdd 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -18,7 +18,6 @@ package bn254 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bn254/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,99 +140,79 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC13] + } return processChunkG1BatchAffine[bucketg1JacExtendedC13, bucketG1AffineC13, bitSetC13, pG1AffineC13, ppG1AffineC13, qG1AffineC13, cG1AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. 
- // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC14] + } return processChunkG1BatchAffine[bucketg1JacExtendedC14, bucketG1AffineC14, bitSetC14, pG1AffineC14, ppG1AffineC14, qG1AffineC14, cG1AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC15] + } return processChunkG1BatchAffine[bucketg1JacExtendedC15, bucketG1AffineC15, bitSetC15, pG1AffineC15, ppG1AffineC15, qG1AffineC15, cG1AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -247,8 +226,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -258,10 +236,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
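	// (for instance, with bn254's 254-bit scalar field and c = 16, the first
	// fifteen windows are 16 bits wide and the final window covers only the
	// leftover high bits, roughly 254 - 15*16 = 14 of them; lastC(c) reports
	// that narrower width for the dispatch below.)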
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -419,99 +395,79 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 6: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC6] case 7: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC7] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 9: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC9] case 10: - // const batchSize = 80 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC10] - // } - // fmt.Printf("affine \n") + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 11: - // const batchSize = 150 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC11] - // } - // fmt.Printf("affine \n") + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 12: - // const batchSize = 200 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC12] - // } - // fmt.Printf("affine \n") + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
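+	// (rule of thumb: the jacobian path has no batching machinery, so when a
+	// chunk hits fewer distinct buckets than a single batch can hold, the
+	// batch affine scheduler cannot fill its batches and tends to lose.)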
+ return processChunkG2Jacobian[bucketg2JacExtendedC12] + } return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 13: - // const batchSize = 350 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC13] - // } - // fmt.Printf("affine \n") + const batchSize = 350 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC13] + } return processChunkG2BatchAffine[bucketg2JacExtendedC13, bucketG2AffineC13, bitSetC13, pG2AffineC13, ppG2AffineC13, qG2AffineC13, cG2AffineC13] case 14: - // const batchSize = 400 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC14] - // } - // fmt.Printf("affine \n") + const batchSize = 400 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC14] + } return processChunkG2BatchAffine[bucketg2JacExtendedC14, bucketG2AffineC14, bitSetC14, pG2AffineC14, ppG2AffineC14, qG2AffineC14, cG2AffineC14] case 15: - // const batchSize = 500 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC15] - // } - // fmt.Printf("affine \n") + const batchSize = 500 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC15] + } return processChunkG2BatchAffine[bucketg2JacExtendedC15, bucketG2AffineC15, bitSetC15, pG2AffineC15, ppG2AffineC15, qG2AffineC15, cG2AffineC15] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -525,8 +481,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -536,10 +491,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -618,21 +571,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int - - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + weight float32 - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -721,74 +671,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
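+			// (a non-zero digit d maps to bucket d>>1 when d is odd and to
+			// bucket (d>>1)-1 when d is even, so digits 1 and 2 share bucket 0,
+			// digits 3 and 4 share bucket 1, and so on; the low bit appears to
+			// carry the sign of the signed-digit encoding.)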
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index b3a812b962..c299f039b8 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 1bc8b9e0ef..e700b666bf 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -18,7 +18,6 @@ package bw6633 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-633/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,24 +140,19 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. 
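+		// (batchSize is tuned per window width c: the bn254 file above uses 80
+		// for c=10, 150 for c=11, 200 for c=12, 350 for c=13, 400 for c=14,
+		// 500 for c=15 and 640 for c=16.)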
+ if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -172,8 +166,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -183,10 +176,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -344,24 +335,19 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
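+			// (bw6-633 only generates windows of width 4, 5, 8 and 16, so c=16
+			// is the single case with a batch affine variant on this curve.)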
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -375,8 +361,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -386,10 +371,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -468,21 +451,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int + weight float32 - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int - - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -571,74 +551,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
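+			// (the weights collected below are normalized at the end of this
+			// function so that 100 means an average chunk: e.g. three chunks
+			// doing 50, 100 and 150 additions give totalOps = 300, target = 100,
+			// and final weights of 50, 100 and 150.)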
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 95312917d9..bee27d7123 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index e695c8791f..c81d3d8c0b 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -18,7 +18,6 @@ package bw6756 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-756/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,24 +140,19 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. 
+ if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -172,8 +166,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -183,10 +176,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -344,24 +335,19 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
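+		// (the long type-parameter list on the batch affine variant below is,
+		// presumably, how the generator wires in the per-width sizes: the
+		// bucket arrays, the bitset marking busy buckets, and the
+		// fixed-capacity batch and queue arrays.)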
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -375,8 +361,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -386,10 +371,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -468,21 +451,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int + weight float32 - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int - - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -571,74 +551,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
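+			// (ppBucketFilled below divides by 1<<(c-1) rather than 1<<c
+			// because the signed-digit encoding halves the number of buckets a
+			// window needs.)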
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index a94008b836..4fce6462a5 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index b89e06e566..d928a013fd 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -18,7 +18,6 @@ package bw6761 import ( "errors" - "fmt" "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" "github.com/consensys/gnark-crypto/internal/parallel" @@ -141,24 +140,19 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG1Jacobian[bucketg1JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG1Jacobian[bucketg1JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. 
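+		// (stat is produced by partitionScalars, which scans the digits of
+		// each chunk once, up front, and records how many distinct buckets
+		// get hit.)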
+ if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -172,8 +166,7 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g1JacExtended, nbChunks) @@ -183,10 +176,8 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -344,24 +335,19 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch switch c { case 4: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: - // fmt.Printf("jacobian \n") return processChunkG2Jacobian[bucketg2JacExtendedC8] case 16: - // const batchSize = 640 - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunkG2Jacobian[bucketg2JacExtendedC16] - // } - // fmt.Printf("affine \n") + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] default: // panic("will not happen c != previous values is not generated by templates") @@ -375,8 +361,7 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan g2JacExtended, nbChunks) @@ -386,10 +371,8 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -468,21 +451,18 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int + weight float32 - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + // // average absolute deviation. this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int - - // average ops per non-zero buckets - averageOpsPerBucket int -} + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) + // // average ops per non-zero buckets + // averageOpsPerBucket int } // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -571,74 +551,49 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. 
+ var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars) : (chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit&1 == 0 { + bucketID -= 1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1<<(c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps += stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 7d9abce749..d21931beca 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -222,38 +222,38 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG1(samplePoints[:]) var testPoint G1Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -263,19 +263,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } @@ -539,38 +539,38 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:], sampleScalars[:]) - copy(sampleScalarsRedundant[:], sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i := 0; i < len(sampleScalarsSmallValues); i++ { - if i%5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. 
- for i := 0; i < len(sampleScalarsRedundant); i += 100 { - for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. + // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBasesG2(samplePoints[:]) var testPoint G2Affine - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -580,19 +580,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index b9563c9a61..d1826081ce 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -14,7 +14,6 @@ import ( "errors" "math" "runtime" - "fmt" ) {{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} @@ -57,22 +56,20 @@ func lastC(c uint64) uint64 { type chunkStat struct { // relative weight of work compared to other chunks. 100.0 -> nominal weight. - weight int + weight float32 - // average absolute deviation. this is meant to give a sense of statistical - // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) - deviation int + // // average absolute deviation. 
this is meant to give a sense of statistical + // // dispertion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets) + // deviation int - // count the number of buckets that are non zeroes for this chunk - nonZeroBuckets int + // percentage of bucket filled in the window; + ppBucketFilled float32 + nbBucketFilled int - // average ops per non-zero buckets - averageOpsPerBucket int + // // average ops per non-zero buckets + // averageOpsPerBucket int } -func (c *chunkStat) String() string { - return fmt.Sprintf("weight: %d, deviation: %d, nz: %d, averageOps: %d", c.weight, c.deviation, c.nonZeroBuckets, c.averageOpsPerBucket) -} // partitionScalars compute, for each scalars over c-bit wide windows, nbChunk digits @@ -164,75 +161,50 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks }, nbTasks) - // abs := func(v int) int { - // if v < 0 { - // return -v - // } - // return v - // } - + // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { - // var opsPerBucket [1 << 15]int // max value is 16 for c + // indicates if a bucket is hit. + var b bitSetC16 + // digits for the chunk chunkDigits := digits[chunkID*len(scalars):(chunkID+1)*len(scalars)] totalOps := 0 - // nz := 0 // non zero buckets count + nz := 0 // non zero buckets count for _, digit := range chunkDigits { if digit == 0 { continue } totalOps++ - // bucketID := digit >> 1 - // if digit &1 == 0 { - // bucketID-=1 - // } - // if opsPerBucket[bucketID] == 0 { - // nz++ - // } - // opsPerBucket[bucketID]++ + bucketID := digit >> 1 + if digit &1 == 0 { + bucketID-=1 + } + if !b[bucketID] { + nz++ + b[bucketID] = true + } } - chunkStats[chunkID].weight = totalOps // count number of ops for now, we will compute the weight after - // chunkStats[chunkID].nonZeroBuckets = nz - - - // if nz == 0 { - // return // no ops, only zeroes - // } - - // bound := 1 << (c-1) - // if chunkID == int(nbChunks-1) { - // bound = 1 << (lastC(c)-1) - // } - // mean := totalOps / nz - // aad := 0 - // averageOpsPerBucket := 0 - // for b:=0; b < bound; b++ { - // if opsPerBucket[b] == 0 { - // continue - // } - // aad += abs(opsPerBucket[b] - mean) - // averageOpsPerBucket += opsPerBucket[b] - // } - // chunkStats[chunkID].averageOpsPerBucket = averageOpsPerBucket / nz - // chunkStats[chunkID].deviation = aad / nz + chunkStats[chunkID].weight = float32(totalOps) // count number of ops for now, we will compute the weight after + chunkStats[chunkID].ppBucketFilled = (float32(nz) * 100.0) / float32(int(1 << (c-1))) + chunkStats[chunkID].nbBucketFilled = nz } }, nbTasks) - totalOps := 0 + totalOps := float32(0.0) for _, stat := range chunkStats { totalOps+=stat.weight } - target := totalOps / int(nbChunks) - if target != 0 { + target := totalOps / float32(nbChunks) + if target != 0.0 { // if target == 0, it means all the scalars are 0 everywhere, there is no work to be done. 
for i := 0; i < len(chunkStats); i++ { - chunkStats[i].weight = (chunkStats[i].weight * 100) / target + chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target } } @@ -491,18 +463,15 @@ func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} - // fmt.Printf("jacobian \n") return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] {{- else}} - // const batchSize = {{batchSize $c}} - // status: we are losing in perf here in the nominal case. - // stat.deviation seems not good. - // edgeCaseAffine := (batchSize > (stat.nonZeroBuckets / 10)) || (stat.deviation >= 20) - // if edgeCaseAffine { - // // fmt.Printf("jacobian \n") - // return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] - // } - // fmt.Printf("affine \n") + const batchSize = {{batchSize $c}} + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + } return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] {{- end}} {{- end}} @@ -518,8 +487,7 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // for each chunk, spawn one go routine that'll loop through all the scalars in the // corresponding bit-window - // note that buckets is an array allocated on the stack (for most sizes of c) and this is - // critical for performance + // note that buckets is an array allocated on the stack and this is critical for performance // each go routine sends its result in chChunks[i] channel chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks) @@ -529,10 +497,8 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T // the last chunk may be processed with a different method than the rest, as it could be smaller. 
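
Concretely, the statistics driving getChunkProcessor reduce to two numbers per window: how many digit operations the chunk contributes, and how many distinct buckets those operations touch. A rough standalone model of the loop above, assuming only what this diff shows (a map stands in for bitSetC16; zero digit = skip, odd digit = add into bucket digit>>1, even digit = subtract from bucket (digit>>1)-1):

func chunkStats(digits []uint16, c uint64) (totalOps, nbBucketFilled int, ppBucketFilled float32) {
	hit := make(map[uint16]bool)
	for _, digit := range digits {
		if digit == 0 {
			continue // this scalar contributes nothing to this window
		}
		totalOps++
		bucketID := digit >> 1
		if digit&1 == 0 {
			bucketID -= 1 // even digits encode a subtraction
		}
		if !hit[bucketID] {
			nbBucketFilled++
			hit[bucketID] = true
		}
	}
	// signed digits halve the bucket count: 2^(c-1) buckets per window
	ppBucketFilled = float32(nbBucketFilled) * 100.0 / float32(int(1)<<(c-1))
	return
}

The weights are then rescaled so that 100.0 is the per-chunk average, which is what the commented-out Printf below reports next to ppBucketFilled.
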
n := len(points) - // fmt.Printf("\n") - // fmt.Println("n", n) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %s -->", j, chunkStats[j].String()) + // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessor{{ $.UPointName }}(c, chunkStats[j]) if j == int(nbChunks - 1) { processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c), chunkStats[j]) diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 91af5a996b..ed18ab1c46 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -237,37 +237,37 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var ( samplePoints [nbSamples]{{ $.TAffine }} sampleScalars [nbSamples]fr.Element - sampleScalarsSmallValues [nbSamples]fr.Element - sampleScalarsRedundant [nbSamples]fr.Element + // sampleScalarsSmallValues [nbSamples]fr.Element + // sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - copy(sampleScalarsSmallValues[:],sampleScalars[:]) - copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // this means first chunk is going to have more work to do and should be split into several go routines - for i:=0; i < len(sampleScalarsSmallValues);i++ { - if i % 5 == 0 { - sampleScalarsSmallValues[i].SetZero() - sampleScalarsSmallValues[i][0] = 1 - } - } - - // bad case for batch affine because scalar distribution might look uniform - // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // to process small batches of additions to flush its queue of conflicted points. - for i:=0; i < len(sampleScalarsRedundant);i+=100 { - for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - } - } + // copy(sampleScalarsSmallValues[:],sampleScalars[:]) + // copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // // this means first chunk is going to have more work to do and should be split into several go routines + // for i:=0; i < len(sampleScalarsSmallValues);i++ { + // if i % 5 == 0 { + // sampleScalarsSmallValues[i].SetZero() + // sampleScalarsSmallValues[i][0] = 1 + // } + // } + + // // bad case for batch affine because scalar distribution might look uniform + // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // // to process small batches of additions to flush its queue of conflicted points. 
+ // for i:=0; i < len(sampleScalarsRedundant);i+=100 { + // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + // } + // } fillBenchBases{{ toUpper $.PointName }}(samplePoints[:]) var testPoint {{ $.TAffine }} - for i := 22; i <= pow; i++ { + for i := 15; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -277,19 +277,19 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { } }) - b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - } - }) - - b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - b.ResetTimer() - for j := 0; j < b.N; j++ { - testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - } - }) + // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + // } + // }) + + // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + // b.ResetTimer() + // for j := 0; j < b.N; j++ { + // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + // } + // }) } } From df5fcdf568fdf84fd5ed160667677e43cdc4a79c Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 11:00:16 -0600 Subject: [PATCH 31/43] style: added comments and clean msm --- ecc/bls12-377/multiexp_affine.go | 136 ++++++++---------- ecc/bls12-377/multiexp_test.go | 8 +- ecc/bls12-378/multiexp_affine.go | 136 ++++++++---------- ecc/bls12-378/multiexp_test.go | 8 +- ecc/bls12-381/multiexp_affine.go | 136 ++++++++---------- ecc/bls12-381/multiexp_test.go | 8 +- ecc/bls24-315/multiexp_affine.go | 136 ++++++++---------- ecc/bls24-315/multiexp_test.go | 8 +- ecc/bls24-317/multiexp_affine.go | 136 ++++++++---------- ecc/bls24-317/multiexp_test.go | 8 +- ecc/bn254/multiexp_affine.go | 136 ++++++++---------- ecc/bn254/multiexp_test.go | 8 +- ecc/bw6-633/multiexp_affine.go | 136 ++++++++---------- ecc/bw6-633/multiexp_test.go | 8 +- ecc/bw6-756/multiexp_affine.go | 136 ++++++++---------- ecc/bw6-756/multiexp_test.go | 8 +- ecc/bw6-761/multiexp_affine.go | 136 ++++++++---------- ecc/bw6-761/multiexp_test.go | 8 +- .../ecc/template/multiexp_affine.go.tmpl | 75 ++++------ .../ecc/template/tests/multiexp.go.tmpl | 8 +- 20 files changed, 571 insertions(+), 808 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index b2c167e9bc..1504538cf5 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -23,13 +23,7 @@ import ( type batchOpG1Affine struct { bucketID uint16 - // pointID uint32 - point G1Affine -} - -func (o batchOpG1Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G1Affine } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP points []G1Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. 
if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g1JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. + + // note that we have 2 sets of buckets + // 1 in G1Affine used with the batch affine additions + // 1 in g1JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG1Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID--
 		}
 	}
 
@@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			add(bucketID, &points[i], isAdd)
 			if isFull() {
 				executeAndReset()
-				processQueue() // TODO top queue only
+				processTopQueue()
 			}
 		}
 
@@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicted point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if not conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush it"; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following; batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and the element added to the queue should be immediately
+	// processed in the next batch. If it's not the case, then our inputs are not random; and we fall back to
+	// the non-batch-affine version.
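
In miniature, the scheme described in the comment above looks like the sketch below, with plain ints standing in for curve points and buckets. batchAccumulate and its helpers are names invented for this illustration; the real executeAndReset calls batchAddG2Affine, which amortizes a single field inversion over the whole batch, and the real queue is a fixed-size array that triggers flushQueue when it overflows.

func batchAccumulate(bucketIDs []uint16, values []int, nbBuckets int) []int {
	const batchSize = 4 // illustration only; this patch uses 640 for c == 16

	type op struct {
		bucketID uint16
		value    int
	}

	buckets := make([]int, nbBuckets)
	inBatch := make(map[uint16]bool) // buckets already used by the current batch
	var batch, queue []op

	executeAndReset := func() {
		for _, o := range batch { // one pass; the real batch shares 1 inversion
			buckets[o.bucketID] += o.value
		}
		batch = batch[:0]
		inBatch = make(map[uint16]bool)
	}

	processTopQueue := func() {
		for len(queue) > 0 {
			o := queue[len(queue)-1]
			if inBatch[o.bucketID] {
				return // top of queue still conflicts; retry after next batch
			}
			queue = queue[:len(queue)-1]
			inBatch[o.bucketID] = true
			batch = append(batch, o)
		}
	}

	for i, b := range bucketIDs {
		if inBatch[b] {
			queue = append(queue, op{b, values[i]}) // conflict: defer the point
		} else {
			inBatch[b] = true
			batch = append(batch, op{b, values[i]})
		}
		if len(batch) == batchSize {
			executeAndReset()
			processTopQueue()
		}
	}
	executeAndReset()
	for _, o := range queue { // flushQueue: sequential fallback additions
		buckets[o.bucketID] += o.value
	}
	return buckets
}

For instance, batchAccumulate([]uint16{1, 1, 1, 2}, []int{5, 5, 5, 7}, 8) defers the second and third additions to bucket 1 and still ends with buckets[1] == 15 and buckets[2] == 7.
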
+ + // note that we have 2 sets of buckets + // 1 in G2Affine used with the batch affine additions + // 1 in g2JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG2Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID-- } } @@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 8a036b0f54..8700cfd9b3 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index af73df4882..8299397508 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -23,13 +23,7 @@ import ( type batchOpG1Affine struct { bucketID uint16 - // pointID uint32 - point G1Affine -} - -func (o batchOpG1Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G1Affine } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP points []G1Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g1JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. 
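
The reduction that closes each of these chunk processors is the classic running-sum trick; a minimal integer model, merging the affine buckets with the JacExtended overflow buckets the way the loop above does (both slices must have equal length, mirroring buckets and bucketsJE):

// total = 1*b[0] + 2*b[1] + ... + n*b[n-1], in 2n additions and no multiplications
func reduceBuckets(affine, extended []int) int {
	runningSum, total := 0, 0
	for k := len(affine) - 1; k >= 0; k-- {
		runningSum += affine[k] + extended[k] // merge both bucket sets on the fly
		total += runningSum                   // b[k] is accumulated k+1 times overall
	}
	return total
}
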
+ + // note that we have 2 sets of buckets + // 1 in G1Affine used with the batch affine additions + // 1 in g1JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG1Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID-- } } @@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g1JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } @@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - // pointID uint32 - point G2Affine -} - -func (o batchOpG2Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G2Affine } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP points []G2Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g2JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. 
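
The digits consumed above come from partitionScalars; per the statistics code earlier in this patch, the low bit of a digit carries the sign (odd = add into bucket digit>>1, even non-zero = subtract from bucket (digit>>1)-1). A generic sketch of the underlying signed-window decomposition — not this repository's partitionScalars, which also packs that encoding and balances chunks — for a 64-bit scalar:

// signedDigits rewrites scalar in base 2^c with digits in [-2^(c-1), 2^(c-1)],
// so only 2^(c-1) buckets are needed per window (negative digits subtract the
// point instead of adding it).
func signedDigits(scalar uint64, c uint) []int64 {
	mask := uint64(1)<<c - 1
	var digits []int64
	carry := uint64(0)
	for scalar != 0 || carry != 0 {
		d := (scalar & mask) + carry
		scalar >>= c
		carry = 0
		if d > uint64(1)<<(c-1) {
			// borrow: emit d - 2^c (negative) and carry 1 into the next window
			digits = append(digits, int64(d)-int64(uint64(1)<<c))
			carry = 1
		} else {
			digits = append(digits, int64(d))
		}
	}
	return digits
}

Reconstructing sum(digits[i] * 2^(c*i)) gives the scalar back; e.g. with c = 4, the scalar 15 decomposes into [-1, 1] since 15 = -1 + 1*16.
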
+ + // note that we have 2 sets of buckets + // 1 in G2Affine used with the batch affine additions + // 1 in g2JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG2Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID-- } } @@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 1e9ff1e4de..44c19874e4 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 992efc3fa4..b7af2292aa 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -23,13 +23,7 @@ import ( type batchOpG1Affine struct { bucketID uint16 - // pointID uint32 - point G1Affine -} - -func (o batchOpG1Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G1Affine } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP points []G1Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g1JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. 
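
The claim that conflicts stay rare for uniformly random inputs can be sized with a quick estimate (an idealization introduced here, not something this patch computes): if the bucket IDs entering a batch are independent and uniform over nbBuckets buckets, an op arriving while k ops are already in flight conflicts with probability about k/nbBuckets, so a full batch sees roughly batchSize*(batchSize-1)/(2*nbBuckets) conflicts.

// back-of-the-envelope only; for c = 16: 640*639/(2*32768) ≈ 6.2 deferred
// points per batch of 640, i.e. about 1% of the ops.
func expectedConflictsPerBatch(batchSize, nbBuckets int) float64 {
	return float64(batchSize) * float64(batchSize-1) / (2 * float64(nbBuckets))
}
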
+ + // note that we have 2 sets of buckets + // 1 in G1Affine used with the batch affine additions + // 1 in g1JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG1Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
qID-- } } @@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g1JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } @@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - // pointID uint32 - point G2Affine -} - -func (o batchOpG2Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G2Affine } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP points []G2Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicted point to a queue. + // each time the batch is full, we execute it, and tentatively put the points (if not conflict) + // from the top of the queue into the next batch. + // if the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g2JacExtended coordinates. + // The reasoning behind this is the following; batchSize is chosen such as, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediatly + // processed in the next batch. If it's not the case, then our inputs are not random; and we fallback to + // non-batch-affine version. 
+ + // note that we have 2 sets of buckets + // 1 in G2Affine used with the batch affine additions + // 1 in g2JacExtended used in case the queue of conflicting points var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG2Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: must ensures bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch. 
 			qID--
 		}
 	}
@@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go
index bb1a3ac61e..946645ded0 100644
--- a/ecc/bls12-381/multiexp_test.go
+++ b/ecc/bls12-381/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go
index ac3db048e6..6ef411f4b5 100644
--- a/ecc/bls24-315/multiexp_affine.go
+++ b/ecc/bls24-315/multiexp_affine.go
@@ -23,13 +23,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
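The bucket reduction in the hunks above computes total = bucket[0] + 2*bucket[1] + 3*bucket[2] + ... + n*bucket[n-1] with two running sums and no scalar multiplications. A sketch of the same recurrence over plain integers (group additions replaced by integer +, purely for intuition):

    package main

    import "fmt"

    func main() {
    	// bucket[k] stands for the sum of all points whose digit is k+1
    	buckets := []int{3, 0, 7, 1}
    	runningSum, total := 0, 0
    	for k := len(buckets) - 1; k >= 0; k-- {
    		runningSum += buckets[k] // bucket[k] + bucket[k+1] + ... + bucket[n-1]
    		total += runningSum
    	}
    	fmt.Println(total) // 1*3 + 2*0 + 3*7 + 4*1 = 28
    }

Scanning from the top bucket down, runningSum holds bucket[k] + ... + bucket[n-1]; since it is added to total once per iteration, bucket[k] contributes exactly (k+1) times, which is the weighted sum above.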
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go
index f44d8d7b81..27efcb0f13 100644
--- a/ecc/bls24-315/multiexp_test.go
+++ b/ecc/bls24-315/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go
index 4e241f79a5..a442f743ec 100644
--- a/ecc/bls24-317/multiexp_affine.go
+++ b/ecc/bls24-317/multiexp_affine.go
@@ -23,13 +23,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go
index b3faa9e76f..95650ab5ca 100644
--- a/ecc/bls24-317/multiexp_test.go
+++ b/ecc/bls24-317/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go
index a268d19c87..447d11c42c 100644
--- a/ecc/bn254/multiexp_affine.go
+++ b/ecc/bn254/multiexp_affine.go
@@ -23,13 +23,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -45,10 +39,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -63,13 +70,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -77,12 +81,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -95,7 +103,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -109,13 +116,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -127,14 +133,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -158,17 +161,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -190,10 +189,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -209,7 +206,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -218,21 +215,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -352,13 +342,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -374,10 +358,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -392,13 +389,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -406,12 +400,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -424,7 +422,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -438,13 +435,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -456,14 +452,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -487,17 +480,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -519,10 +508,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -538,7 +525,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -547,21 +534,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go
index c299f039b8..68e7b17e41 100644
--- a/ecc/bn254/multiexp_test.go
+++ b/ecc/bn254/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go
index a10b6cbf76..493d750496 100644
--- a/ecc/bw6-633/multiexp_affine.go
+++ b/ecc/bw6-633/multiexp_affine.go
@@ -22,13 +22,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -44,10 +38,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -62,13 +69,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -76,12 +80,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -94,7 +102,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -108,13 +115,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -126,14 +132,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -157,17 +160,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -189,10 +188,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -208,7 +205,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -217,21 +214,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -279,13 +269,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -301,10 +285,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -319,13 +316,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -333,12 +327,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -351,7 +349,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -365,13 +362,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -383,14 +379,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -414,17 +407,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -446,10 +435,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -465,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -474,21 +461,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g2JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go
index bee27d7123..b059cbc98f 100644
--- a/ecc/bw6-633/multiexp_test.go
+++ b/ecc/bw6-633/multiexp_test.go
@@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine)
 
 func fillBenchScalars(sampleScalars []fr.Element) {
 	// ensure every words of the scalars are filled
-	var mixer fr.Element
-	mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487")
-	for i := 1; i <= len(sampleScalars); i++ {
-		sampleScalars[i-1].SetUint64(uint64(i)).
-			Mul(&sampleScalars[i-1], &mixer).
-			FromMont()
+	for i := 0; i < len(sampleScalars); i++ {
+		sampleScalars[i].SetRandom()
 	}
 }
diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go
index 00e9053fd1..003b4678bd 100644
--- a/ecc/bw6-756/multiexp_affine.go
+++ b/ecc/bw6-756/multiexp_affine.go
@@ -22,13 +22,7 @@ import (
 
 type batchOpG1Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G1Affine
-}
-
-func (o batchOpG1Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G1Affine
 }
 
 // processChunkG1BatchAffine process a chunk of the scalars during the msm
@@ -44,10 +38,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	points []G1Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g1JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G1Affine used with the batch affine additions
+	// 1 in g1JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -62,13 +69,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG1Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -76,12 +80,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG1Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -94,7 +102,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -108,13 +115,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -126,14 +132,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -157,17 +160,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
 			qID--
 		}
 	}
@@ -189,10 +188,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 			// put it in queue
 			queue[qID].bucketID = bucketID
 			if isAdd {
-				// queue[qID].pointID = uint32(i << 1)
 				queue[qID].point.Set(&points[i])
 			} else {
-				// queue[qID].pointID = uint32(i << 1) + 1
 				queue[qID].point.Neg(&points[i])
 			}
 			qID++
@@ -208,7 +205,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 		add(bucketID, &points[i], isAdd)
 		if isFull() {
 			executeAndReset()
-			processQueue() // TODO top queue only
+			processTopQueue()
 		}
 	}
@@ -217,21 +214,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP
 	// empty the queue
 	flushQueue()
-	// for qID != 0 {
-	// 	processQueue()
-	// 	executeAndReset()
-	// }
 	// reduce buckets into total
 	// total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1]
-
 	var runningSum, total g1JacExtended
 	runningSum.setInfinity()
 	total.setInfinity()
 	for k := len(buckets) - 1; k >= 0; k-- {
-		if bucketSet[k] {
-			runningSum.addMixed(&buckets[k])
-		}
+		runningSum.addMixed(&buckets[k])
 		if !bucketsJE[k].ZZ.IsZero() {
 			runningSum.add(&bucketsJE[k])
 		}
@@ -279,13 +269,7 @@ type qG1AffineC16 [640]batchOpG1Affine
 
 type batchOpG2Affine struct {
 	bucketID uint16
-	// pointID  uint32
-	point    G2Affine
-}
-
-func (o batchOpG2Affine) isNeg() bool {
-	return false
-	// return o.pointID&1 == 1
+	point    G2Affine
 }
 
 // processChunkG2BatchAffine process a chunk of the scalars during the msm
@@ -301,10 +285,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 	points []G2Affine,
 	digits []uint16) {
 
-	// init the buckets
+	// the batch affine addition needs independent points; in other words, for a window of batchSize
+	// we want to hit independent bucketIDs when processing the digit. if there is a conflict (we're trying
+	// to add 2 different points to the same bucket), then we push the conflicting point to a queue.
+	// each time the batch is full, we execute it, and tentatively put the points (if they no longer conflict)
+	// from the top of the queue into the next batch.
+	// if the queue is full, we "flush" it; we sequentially add the points to the buckets in
+	// g2JacExtended coordinates.
+	// The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random
+	// input, the number of conflicts is going to be low, and an element added to the queue should be immediately
+	// processed in the next batch. If that is not the case, then our inputs are not random, and we fall back to
+	// the non-batch-affine version.
+
+	// note that we have 2 sets of buckets
+	// 1 in G2Affine used with the batch affine additions
+	// 1 in g2JacExtended used in case the queue of conflicting points is full
 	var buckets B
 	var bucketsJE BJE
-	var bucketSet BS
 	for i := 0; i < len(buckets); i++ {
 		buckets[i].setInfinity()
 		bucketsJE[i].setInfinity()
@@ -319,13 +316,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		queue   TQ  // queue of points that conflict the current batch
 		qID     int // current position in queue
 	)
-	// var queue [batchSize]batchOpG2Affine
 
 	batchSize := len(P)
 
-	isFull := func() bool {
-		return cptAdd == batchSize
-	}
+	isFull := func() bool { return cptAdd == batchSize }
 
 	executeAndReset := func() {
 		batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd)
@@ -333,12 +327,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		bucketIds = tmp
 		cptAdd = 0
 	}
+
 	addFromQueue := func(op batchOpG2Affine) {
-		// @precondition: ensures bucket is not "used" in current batch
+		// @precondition: the caller must ensure the bucket is not "used" in the current batch
+		// note that there is a bit of duplicate logic between add and addFromQueue;
+		// the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature),
+		// the compiler will put the queue on the heap.
 		BK := &buckets[op.bucketID]
+
 		// handle special cases with inf or -P / P
-		if !bucketSet[op.bucketID] {
-			bucketSet[op.bucketID] = true
+		if BK.IsInfinity() {
 			BK.Set(&op.point)
 			return
 		}
@@ -351,7 +349,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 			return
 		}
 		BK.setInfinity()
-		bucketSet[op.bucketID] = false
 		return
 	}
@@ -365,13 +362,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		// @precondition: ensures bucket is not "used" in current batch
 		BK := &buckets[bucketID]
 		// handle special cases with inf or -P / P
-		if !bucketSet[bucketID] {
+		if BK.IsInfinity() {
 			if isAdd {
 				BK.Set(PP)
 			} else {
 				BK.Neg(PP)
 			}
-			bucketSet[bucketID] = true
 			return
 		}
 		if BK.X.Equal(&PP.X) {
@@ -383,14 +379,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 				BK.Add(BK, BK)
 			} else {
 				BK.setInfinity()
-				bucketSet[bucketID] = false
 			}
-
 			return
 		}
 
 		if isAdd {
 			BK.setInfinity()
-			bucketSet[bucketID] = false
 		} else {
 			BK.Add(BK, BK)
 		}
@@ -414,17 +407,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP
 		qID = 0
 	}
 
-	processQueue := func() {
+	processTopQueue := func() {
 		for i := qID - 1; i >= 0; i-- {
 			if bucketIds[queue[i].bucketID] {
 				return
 			}
 			addFromQueue(queue[i])
-			// add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg())
-			// if isFull() {
-			// 	executeAndReset()
-			// }
-			// queue[i] = queue[qID-1]
+			// len(queue) < batchSize, so there is no need to check for a full batch here.
qID-- } } @@ -446,10 +435,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -465,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -474,21 +461,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 4fce6462a5..43529e1dbd 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 13f133daa5..c85ea75c3f 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -22,13 +22,7 @@ import ( type batchOpG1Affine struct { bucketID uint16 - // pointID uint32 - point G1Affine -} - -func (o batchOpG1Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G1Affine } // processChunkG1BatchAffine process a chunk of the scalars during the msm @@ -44,10 +38,23 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP points []G1Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. If there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicting point to a queue. + // Each time the batch is full, we execute it, and tentatively put the points (if not conflicting) + // from the top of the queue into the next batch. + // If the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g1JacExtended coordinates. + // The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediately + // processed in the next batch. If that's not the case, then our inputs are not random, and we fall back to + // the non-batch-affine version.
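The reduce-buckets step that closes each of these chunk processors evaluates total = 1*bucket[0] + 2*bucket[1] + ... + n*bucket[n-1] with additions only: scanning from the top bucket down, each bucket is folded into a running sum, and the running sum is folded into the total, so bucket[k] ends up counted k+1 times. A minimal integer sketch of the same identity (weightedSum is a hypothetical name):

// weightedSum returns 1*b[0] + 2*b[1] + ... + n*b[n-1] using only additions,
// mirroring the runningSum/total loop over the buckets
func weightedSum(b []int) int {
    runningSum, total := 0, 0
    for k := len(b) - 1; k >= 0; k-- {
        runningSum += b[k] // from now on, b[k] is added once per remaining iteration
        total += runningSum
    }
    return total
}

For example, weightedSum([]int{7, 0, 5}) returns 7 + 2*0 + 3*5 = 22.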
+ + // note that we have 2 sets of buckets + // 1 in G1Affine used with the batch affine additions + // 1 in g1JacExtended used in case the queue of conflicting points overflows var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -62,13 +69,10 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG1Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG1Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -76,12 +80,16 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG1Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: caller must ensure bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -94,7 +102,6 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -108,13 +115,12 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -126,14 +132,11 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -157,17 +160,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch.
qID-- } } @@ -189,10 +188,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -208,7 +205,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -217,21 +214,14 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g1JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } @@ -279,13 +269,7 @@ type qG1AffineC16 [640]batchOpG1Affine type batchOpG2Affine struct { bucketID uint16 - // pointID uint32 - point G2Affine -} - -func (o batchOpG2Affine) isNeg() bool { - return false - // return o.pointID&1 == 1 + point G2Affine } // processChunkG2BatchAffine process a chunk of the scalars during the msm @@ -301,10 +285,23 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP points []G2Affine, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. If there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicting point to a queue. + // Each time the batch is full, we execute it, and tentatively put the points (if not conflicting) + // from the top of the queue into the next batch. + // If the queue is full, we "flush it"; we sequentially add the points to the buckets in + // g2JacExtended coordinates. + // The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediately + // processed in the next batch. If that's not the case, then our inputs are not random, and we fall back to + // the non-batch-affine version.
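Throughout these processors, the generic parameters (B, BJE, BS, TP, TPP, TQ, TC) are constrained to unions of fixed-size array types, so a declaration like var buckets B is a plain stack allocation whose size is fixed at instantiation. A minimal sketch of the pattern, with hypothetical bucketC4/bucketC5 names standing in for the generated types:

// hypothetical sizes, mirroring the generated bucket...C4 / bucket...C5 types
type bucketC4 [1 << (4 - 1)]uint64
type bucketC5 [1 << (5 - 1)]uint64

// union constraint, in the style of ibg1JacExtended / bitSet
type ibBucket interface {
    bucketC4 | bucketC5
}

// countDigits picks its bucket count at instantiation time; since B is a
// fixed-size array type, buckets lives on the stack (no make, no heap)
func countDigits[B ibBucket](digits []uint16) B {
    var buckets B
    for _, d := range digits {
        buckets[int(d)%len(buckets)]++
    }
    return buckets
}

Calling countDigits[bucketC5](digits) selects the 16-bucket variant at compile time, which is how the per-window-size bucket types (bucketg1JacExtendedC4, C5, ...) are chosen in the real code.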
+ + // note that we have 2 sets of buckets + // 1 in G2Affine used with the batch affine additions + // 1 in g2JacExtended used in case the queue of conflicting points overflows var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -319,13 +316,10 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOpG2Affine batchSize := len(P) - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize } executeAndReset := func() { batchAddG2Affine[TP, TPP, TC](&R, &P, cptAdd) @@ -333,12 +327,16 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOpG2Affine) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: caller must ensure bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap. BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -351,7 +349,6 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -365,13 +362,12 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -383,14 +379,11 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -414,17 +407,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP qID = 0 } - processQueue := func() { + processTopQueue := func() { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + // len(queue) < batchSize so no need to check for full batch.
qID-- } } @@ -446,10 +435,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -465,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -474,21 +461,14 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total g2JacExtended runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index d21931beca..3fed44438a 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -669,11 +669,7 @@ func fillBenchBasesG2(samplePoints []G2Affine) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). - FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 6463bc1ede..72acde2cd5 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -14,29 +14,17 @@ import ( {{- end}} ) - - - - - {{ template "multiexp" dict "CoordType" .G1.CoordType "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} {{ template "multiexp" dict "CoordType" .G2.CoordType "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} - {{define "multiexp" }} type batchOp{{ $.TAffine }} struct { bucketID uint16 - // pointID uint32 point {{ $.TAffine }} } -func (o batchOp{{ $.TAffine }}) isNeg() bool { - return false - // return o.pointID&1 == 1 -} - // processChunk{{ $.UPointName }}BatchAffine process a chunk of the scalars during the msm // using affine coordinates for the buckets. To amortize the cost of the inverse in the affine addition // we use a batch affine addition. @@ -50,10 +38,23 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B points []{{ $.TAffine }}, digits []uint16) { - // init the buckets + // the batch affine addition needs independent points; in other words, for a window of batchSize + // we want to hit independent bucketIDs when processing the digit. 
If there is a conflict (we're trying + // to add 2 different points to the same bucket), then we push the conflicting point to a queue. + // Each time the batch is full, we execute it, and tentatively put the points (if not conflicting) + // from the top of the queue into the next batch. + // If the queue is full, we "flush it"; we sequentially add the points to the buckets in + // {{ $.TJacobianExtended }} coordinates. + // The reasoning behind this is the following: batchSize is chosen such that, for a uniformly random + // input, the number of conflicts is going to be low, and the element added to the queue should be immediately + // processed in the next batch. If that's not the case, then our inputs are not random, and we fall back to + // the non-batch-affine version. + + // note that we have 2 sets of buckets + // 1 in {{ $.TAffine }} used with the batch affine additions + // 1 in {{ $.TJacobianExtended }} used in case the queue of conflicting points overflows var buckets B var bucketsJE BJE - var bucketSet BS for i := 0; i < len(buckets); i++ { buckets[i].setInfinity() bucketsJE[i].setInfinity() @@ -68,14 +69,10 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B queue TQ // queue of points that conflict the current batch qID int // current position in queue ) - // var queue [batchSize]batchOp{{ $.TAffine}} batchSize := len(P) - - isFull := func() bool { - return cptAdd == batchSize - } + isFull := func() bool { return cptAdd == batchSize} executeAndReset := func () { batchAdd{{ $.TAffine }}[TP, TPP, TC](&R, &P, cptAdd) @@ -83,12 +80,16 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B bucketIds = tmp cptAdd = 0 } + addFromQueue := func(op batchOp{{ $.TAffine }}) { - // @precondition: ensures bucket is not "used" in current batch + // @precondition: caller must ensure bucket is not "used" in current batch + // note that there is a bit of duplicate logic between add and addFromQueue + // the reason is that as of Go 1.19.3, if we pass a pointer to the queue item (see add signature) + // the compiler will put the queue on the heap.
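The heap remark above refers to Go's escape analysis: as long as queue elements are only copied by value, the fixed-size queue array can stay on the stack, but a pointer to one of its elements that outlives the frame forces the whole array onto the heap. A toy illustration of the mechanism (not this code path), observable with go build -gcflags=-m:

var sink *int // anything whose address is stored here escapes

func stackArray() int {
    var queue [8]int // only read by value: queue stays on the stack
    return queue[3]
}

func heapArray() int {
    var queue [8]int
    sink = &queue[3] // the address outlives the frame: queue is moved to the heap
    return queue[0]
}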
BK := &buckets[op.bucketID] + // handle special cases with inf or -P / P - if !bucketSet[op.bucketID] { - bucketSet[op.bucketID] = true + if BK.IsInfinity() { BK.Set(&op.point) return } @@ -101,7 +102,6 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B return } BK.setInfinity() - bucketSet[op.bucketID] = false return } @@ -115,13 +115,12 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // @precondition: ensures bucket is not "used" in current batch BK := &buckets[bucketID] // handle special cases with inf or -P / P - if !bucketSet[bucketID] { + if BK.IsInfinity() { if isAdd { BK.Set(PP) } else { BK.Neg(PP) } - bucketSet[bucketID] = true return } if BK.X.Equal(&PP.X) { @@ -133,14 +132,11 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B BK.Add(BK, BK) } else { BK.setInfinity() - bucketSet[bucketID] = false } - return } if isAdd { BK.setInfinity() - bucketSet[bucketID] = false } else { BK.Add(BK, BK) } @@ -164,17 +160,13 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B qID = 0 } - processQueue := func () { + processTopQueue := func () { for i := qID - 1; i >= 0; i-- { if bucketIds[queue[i].bucketID] { return } - addFromQueue(queue[i]) - // add(queue[i].bucketID, &points[queue[i].pointID >> 1], !queue[i].isNeg()) - // if isFull() { - // executeAndReset() - // } - // queue[i] = queue[qID-1] + addFromQueue(queue[i]) + // len(queue) < batchSize so no need to check for full batch. qID-- } } @@ -197,10 +189,8 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // put it in queue queue[qID].bucketID = bucketID if isAdd { - // queue[qID].pointID = uint32(i << 1) queue[qID].point.Set(&points[i]) } else { - // queue[qID].pointID = uint32(i << 1) + 1 queue[qID].point.Neg(&points[i]) } qID++ @@ -216,7 +206,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B add(bucketID, &points[i], isAdd) if isFull() { executeAndReset() - processQueue() // TODO top queue only + processTopQueue() } } @@ -226,22 +216,15 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // empty the queue flushQueue() - // for qID != 0 { - // processQueue() - // executeAndReset() - // } // reduce buckets into total // total = bucket[0] + 2*bucket[1] + 3*bucket[2] ... + n*bucket[n-1] - var runningSum, total {{ $.TJacobianExtended }} runningSum.setInfinity() total.setInfinity() for k := len(buckets) - 1; k >= 0; k-- { - if bucketSet[k] { - runningSum.addMixed(&buckets[k]) - } + runningSum.addMixed(&buckets[k]) if !bucketsJE[k].ZZ.IsZero() { runningSum.add(&bucketsJE[k]) } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index ed18ab1c46..cd2799bc6b 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -373,11 +373,7 @@ func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { func fillBenchScalars(sampleScalars []fr.Element) { // ensure every words of the scalars are filled - var mixer fr.Element - mixer.SetString("7716837800905789770901243404444209691916730933998574719964609384059111546487") - for i := 1; i <= len(sampleScalars); i++ { - sampleScalars[i-1].SetUint64(uint64(i)). - Mul(&sampleScalars[i-1], &mixer). 
- FromMont() + for i := 0; i < len(sampleScalars); i++ { + sampleScalars[i].SetRandom() } } From dc404e54032d5e5b3c18191c2940759af0e16ca1 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 11:55:01 -0600 Subject: [PATCH 32/43] fix: fix for small window size no need for stats --- ecc/bls12-377/multiexp.go | 12 +++++ ecc/bls12-377/multiexp_affine.go | 6 ++- ecc/bls12-377/multiexp_jacobian.go | 16 +++--- ecc/bls12-377/multiexp_test.go | 6 +-- ecc/bls12-378/multiexp.go | 12 +++++ ecc/bls12-378/multiexp_affine.go | 6 ++- ecc/bls12-378/multiexp_jacobian.go | 16 +++--- ecc/bls12-378/multiexp_test.go | 6 +-- ecc/bls12-381/multiexp.go | 12 +++++ ecc/bls12-381/multiexp_affine.go | 6 ++- ecc/bls12-381/multiexp_jacobian.go | 8 +-- ecc/bls12-381/multiexp_test.go | 6 +-- ecc/bls24-315/multiexp.go | 12 +++++ ecc/bls24-315/multiexp_affine.go | 6 ++- ecc/bls24-315/multiexp_jacobian.go | 16 +++--- ecc/bls24-315/multiexp_test.go | 6 +-- ecc/bls24-317/multiexp.go | 12 +++++ ecc/bls24-317/multiexp_affine.go | 6 ++- ecc/bls24-317/multiexp_jacobian.go | 8 +-- ecc/bls24-317/multiexp_test.go | 6 +-- ecc/bn254/multiexp.go | 12 +++++ ecc/bn254/multiexp_affine.go | 6 ++- ecc/bn254/multiexp_jacobian.go | 16 +++--- ecc/bn254/multiexp_test.go | 6 +-- ecc/bw6-633/multiexp.go | 30 ++++++++++- ecc/bw6-633/multiexp_affine.go | 50 ++++++++++++++---- ecc/bw6-633/multiexp_jacobian.go | 12 ++--- ecc/bw6-633/multiexp_test.go | 6 +-- ecc/bw6-756/multiexp.go | 30 ++++++++++- ecc/bw6-756/multiexp_affine.go | 50 ++++++++++++++---- ecc/bw6-756/multiexp_jacobian.go | 12 ++--- ecc/bw6-756/multiexp_test.go | 6 +-- ecc/bw6-761/multiexp.go | 34 +++++++++++- ecc/bw6-761/multiexp_affine.go | 52 +++++++++++++++---- ecc/bw6-761/multiexp_jacobian.go | 16 +++--- ecc/bw6-761/multiexp_test.go | 6 +-- internal/generator/config/curve.go | 20 +++---- internal/generator/ecc/generate.go | 17 ++++-- .../generator/ecc/template/multiexp.go.tmpl | 10 +++- .../ecc/template/multiexp_affine.go.tmpl | 4 +- .../ecc/template/multiexp_jacobian.go.tmpl | 10 +--- .../ecc/template/tests/multiexp.go.tmpl | 2 +- 42 files changed, 433 insertions(+), 158 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index b87523e4e0..14fdaa8bc8 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { 
// for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 1504538cf5..83ce91c32a 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool +type bitSetC2 [1 << (2 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC2 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index e3c590196f..8fd4e382ff 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,12 +76,10 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC2 | - bucketg1JacExtendedC1 | + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,12 +155,10 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC2 | - bucketg2JacExtendedC1 | + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 8700cfd9b3..3681f4fc71 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { 
// test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 702015ccb8..7ca2a9edeb 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 8299397508..d8b54b76ca 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC2 [1 << (2 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC2 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index 97a6ac8ac0..eb83e3c1c2 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,12 +76,10 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 
[1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC3 | - bucketg1JacExtendedC2 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,12 +155,10 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC3 | - bucketg2JacExtendedC2 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 44c19874e4..cb553fad54 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 91e471a850..9de4ea488f 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return 
processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index b7af2292aa..bfc282b553 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 17139a4f22..bc304041f6 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,8 +76,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,8 +155,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 946645ded0..b58a70c951 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{1, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 46a8c8bd4f..1ca2222f9f 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 6ef411f4b5..4e679fea95 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool +type bitSetC2 [1 << (2 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC2 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 9f01ed9a7a..23310862df 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,12 +76,10 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type 
bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC2 | - bucketg1JacExtendedC1 | + bucketg1JacExtendedC1 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,12 +155,10 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC2 | - bucketg2JacExtendedC1 | + bucketg2JacExtendedC1 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 27efcb0f13..54df2a76fa 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index d5232436d2..2120c3b479 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 3: + return 
processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index a442f743ec..f657bf2bcf 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index c4fc41bc54..973219cc4b 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,8 +76,6 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,8 +155,6 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 95650ab5ca..0d15fed501 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 6c68b58cdd..b0fb67e9af 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -139,6 +139,10 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -394,6 +398,10 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -673,6 +681,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 447d11c42c..1f132b885e 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -659,6 +659,8 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC2 [1 << (2 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC6 [1 << (6 - 1)]bool @@ -674,7 +676,9 @@ type bitSetC15 [1 << (15 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC2 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC6 | bitSetC7 | diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index 9eaccec8eb..a674d4f724 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -61,6 +61,8 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended @@ -74,12 +76,10 @@ type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended 
type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC3 | - bucketg1JacExtendedC2 | + bucketg1JacExtendedC2 | + bucketg1JacExtendedC3 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,6 +140,8 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended @@ -153,12 +155,10 @@ type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC3 | - bucketg2JacExtendedC2 | + bucketg2JacExtendedC2 | + bucketg2JacExtendedC3 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 68e7b17e41..0fcdbce7bd 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index e700b666bf..c342b9a432 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 12, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -139,12 +139,23 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: return 
processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 12: + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) to determine if calling @@ -279,7 +290,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 12, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -334,12 +345,23 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 12: + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) 
to determine if calling @@ -553,6 +575,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 493d750496..949a53f642 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -234,33 +234,45 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG1AffineC12 [1 << (12 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC16 + bucketG1AffineC12 | + bucketG1AffineC16 } // array of coordinates fp.Element type cG1Affine interface { - cG1AffineC16 + cG1AffineC12 | + cG1AffineC16 } // buckets: array of G1Affine points (for the batch addition) type pG1Affine interface { - pG1AffineC16 + pG1AffineC12 | + pG1AffineC16 } // buckets: array of *G1Affine points (for the batch addition) type ppG1Affine interface { - ppG1AffineC16 + ppG1AffineC12 | + ppG1AffineC16 } // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qG1AffineC16 + qG1AffineC12 | + qG1AffineC16 } +// batch size 200 when c = 12 +type cG1AffineC12 [200]fp.Element +type pG1AffineC12 [200]G1Affine +type ppG1AffineC12 [200]*G1Affine +type qG1AffineC12 [200]batchOpG1Affine + // batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine @@ -481,47 +493,63 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG2AffineC12 [1 << (12 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC16 + bucketG2AffineC12 | + bucketG2AffineC16 } // array of coordinates fp.Element type cG2Affine interface { - cG2AffineC16 + cG2AffineC12 | + cG2AffineC16 } // buckets: array of G2Affine points (for the batch addition) type pG2Affine interface { - pG2AffineC16 + pG2AffineC12 | + pG2AffineC16 } // buckets: array of *G2Affine points (for the batch addition) type ppG2Affine interface { - ppG2AffineC16 + ppG2AffineC12 | + ppG2AffineC16 } // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qG2AffineC16 + qG2AffineC12 | + qG2AffineC16 } +// batch size 200 when c = 12 +type cG2AffineC12 [200]fp.Element +type pG2AffineC12 [200]G2Affine +type ppG2AffineC12 [200]*G2Affine +type qG2AffineC12 [200]batchOpG2Affine + // batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC1 [1 << (1 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC8 [1 << (8 - 1)]bool +type bitSetC12 [1 << (12 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC1 | + bitSetC4 | bitSetC5 | bitSetC8 | + bitSetC12 | bitSetC16 } diff 
--git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index d31a0eaf8c..497f2697fb 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -61,19 +61,19 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | - bucketg1JacExtendedC12 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | + bucketg1JacExtendedC12 | bucketg1JacExtendedC16 } @@ -122,18 +122,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | - bucketg2JacExtendedC12 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | + bucketg2JacExtendedC12 | bucketg2JacExtendedC16 } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index b059cbc98f..60444fc36e 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{1, 4, 5, 8, 12, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index c81d3d8c0b..72b83a7eac 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 11, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -139,12 +139,23 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul 
func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 11: + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) to determine if calling @@ -279,7 +290,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 11, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -334,12 +345,23 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 11: + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) 
to determine if calling @@ -553,6 +575,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 003b4678bd..83cd6d1d61 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -234,33 +234,45 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG1AffineC11 [1 << (11 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC16 + bucketG1AffineC11 | + bucketG1AffineC16 } // array of coordinates fp.Element type cG1Affine interface { - cG1AffineC16 + cG1AffineC11 | + cG1AffineC16 } // buckets: array of G1Affine points (for the batch addition) type pG1Affine interface { - pG1AffineC16 + pG1AffineC11 | + pG1AffineC16 } // buckets: array of *G1Affine points (for the batch addition) type ppG1Affine interface { - ppG1AffineC16 + ppG1AffineC11 | + ppG1AffineC16 } // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qG1AffineC16 + qG1AffineC11 | + qG1AffineC16 } +// batch size 150 when c = 11 +type cG1AffineC11 [150]fp.Element +type pG1AffineC11 [150]G1Affine +type ppG1AffineC11 [150]*G1Affine +type qG1AffineC11 [150]batchOpG1Affine + // batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine @@ -481,47 +493,63 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG2AffineC11 [1 << (11 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC16 + bucketG2AffineC11 | + bucketG2AffineC16 } // array of coordinates fp.Element type cG2Affine interface { - cG2AffineC16 + cG2AffineC11 | + cG2AffineC16 } // buckets: array of G2Affine points (for the batch addition) type pG2Affine interface { - pG2AffineC16 + pG2AffineC11 | + pG2AffineC16 } // buckets: array of *G2Affine points (for the batch addition) type ppG2Affine interface { - ppG2AffineC16 + ppG2AffineC11 | + ppG2AffineC16 } // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qG2AffineC16 + qG2AffineC11 | + qG2AffineC16 } +// batch size 150 when c = 11 +type cG2AffineC11 [150]fp.Element +type pG2AffineC11 [150]G2Affine +type ppG2AffineC11 [150]*G2Affine +type qG2AffineC11 [150]batchOpG2Affine + // batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC8 [1 << (8 - 1)]bool +type bitSetC11 [1 << (11 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC8 | + bitSetC11 | bitSetC16 } diff 
--git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 86ccb23bbc..93fd87fe51 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -61,19 +61,19 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC3 | - bucketg1JacExtendedC11 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | + bucketg1JacExtendedC11 | bucketg1JacExtendedC16 } @@ -122,18 +122,18 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC3 | - bucketg2JacExtendedC11 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | + bucketg2JacExtendedC11 | bucketg2JacExtendedC16 } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 43529e1dbd..e7244b2e97 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{3, 4, 5, 8, 11, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index d928a013fd..3f987c5bd2 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -84,7 +84,7 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 10, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -139,12 +139,25 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul 
func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { + case 2: + return processChunkG1Jacobian[bucketg1JacExtendedC2] + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: return processChunkG1Jacobian[bucketg1JacExtendedC5] case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 10: + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC10] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) to determine if calling @@ -279,7 +292,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 16} + implementedCs := []uint64{4, 5, 8, 10, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -334,12 +347,25 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { + case 2: + return processChunkG2Jacobian[bucketg2JacExtendedC2] + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: return processChunkG2Jacobian[bucketg2JacExtendedC5] case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 10: + const batchSize = 80 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC10] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] case 16: const batchSize = 640 // here we could check some chunk statistic (deviation, ...) 
to determine if calling @@ -553,6 +579,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index c85ea75c3f..bfeea763cb 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -234,33 +234,45 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG1AffineC10 [1 << (10 - 1)]G1Affine type bucketG1AffineC16 [1 << (16 - 1)]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { - bucketG1AffineC16 + bucketG1AffineC10 | + bucketG1AffineC16 } // array of coordinates fp.Element type cG1Affine interface { - cG1AffineC16 + cG1AffineC10 | + cG1AffineC16 } // buckets: array of G1Affine points (for the batch addition) type pG1Affine interface { - pG1AffineC16 + pG1AffineC10 | + pG1AffineC16 } // buckets: array of *G1Affine points (for the batch addition) type ppG1Affine interface { - ppG1AffineC16 + ppG1AffineC10 | + ppG1AffineC16 } // buckets: array of G1Affine queue operations (for the batch addition) type qOpsG1Affine interface { - qG1AffineC16 + qG1AffineC10 | + qG1AffineC16 } +// batch size 80 when c = 10 +type cG1AffineC10 [80]fp.Element +type pG1AffineC10 [80]G1Affine +type ppG1AffineC10 [80]*G1Affine +type qG1AffineC10 [80]batchOpG1Affine + // batch size 640 when c = 16 type cG1AffineC16 [640]fp.Element type pG1AffineC16 [640]G1Affine @@ -481,47 +493,65 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketG2AffineC10 [1 << (10 - 1)]G2Affine type bucketG2AffineC16 [1 << (16 - 1)]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { - bucketG2AffineC16 + bucketG2AffineC10 | + bucketG2AffineC16 } // array of coordinates fp.Element type cG2Affine interface { - cG2AffineC16 + cG2AffineC10 | + cG2AffineC16 } // buckets: array of G2Affine points (for the batch addition) type pG2Affine interface { - pG2AffineC16 + pG2AffineC10 | + pG2AffineC16 } // buckets: array of *G2Affine points (for the batch addition) type ppG2Affine interface { - ppG2AffineC16 + ppG2AffineC10 | + ppG2AffineC16 } // buckets: array of G2Affine queue operations (for the batch addition) type qOpsG2Affine interface { - qG2AffineC16 + qG2AffineC10 | + qG2AffineC16 } +// batch size 80 when c = 10 +type cG2AffineC10 [80]fp.Element +type pG2AffineC10 [80]G2Affine +type ppG2AffineC10 [80]*G2Affine +type qG2AffineC10 [80]batchOpG2Affine + // batch size 640 when c = 16 type cG2AffineC16 [640]fp.Element type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine +type bitSetC2 [1 << (2 - 1)]bool +type bitSetC3 [1 << (3 - 1)]bool type bitSetC4 [1 << (4 - 1)]bool type bitSetC5 [1 << (5 - 1)]bool type bitSetC8 [1 << (8 - 1)]bool +type bitSetC10 [1 << (10 - 1)]bool type bitSetC16 [1 << (16 - 1)]bool type bitSet interface { - bitSetC4 | + bitSetC2 | + bitSetC3 | + bitSetC4 | bitSetC5 | bitSetC8 
| + bitSetC10 | bitSetC16 } diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 3039c09d6c..59edd2d1bd 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended +type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC2 | bucketg1JacExtendedC3 | - bucketg1JacExtendedC10 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | + bucketg1JacExtendedC10 | bucketg1JacExtendedC16 } @@ -124,20 +124,20 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack +type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended +type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC2 | bucketg2JacExtendedC3 | - bucketg2JacExtendedC10 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | + bucketg2JacExtendedC10 | bucketg2JacExtendedC16 } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 3fed44438a..8354467f4b 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -99,7 +99,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{4, 5, 8, 16} + cRange := []uint64{2, 3, 4, 5, 8, 10, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -253,7 +253,7 @@ func BenchmarkMultiExpG1(b *testing.B) { var testPoint G1Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { @@ -570,7 +570,7 @@ func BenchmarkMultiExpG2(b *testing.B) { var testPoint G2Affine - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { diff --git a/internal/generator/config/curve.go b/internal/generator/config/curve.go index 1ff4926ccf..e1e8bb9499 100644 --- a/internal/generator/config/curve.go +++ b/internal/generator/config/curve.go @@ -51,16 +51,16 @@ func (c Curve) Equal(other Curve) bool { } type Point struct { - CoordType string - CoordExtDegree uint8 // value n, such that q = pⁿ - CoordExtRoot int64 // value a, 
such that the field is Fp[X]/(Xⁿ - a) - PointName string - GLV bool // scalar multiplication using GLV - CofactorCleaning bool // flag telling if the Cofactor cleaning is available - CRange, LastCRange []int // multiexp bucket method: generate inner methods (with const arrays) for each c - Projective bool // generate projective coordinates - A []string //A linear coefficient in Weierstrass form - B []string //B constant term in Weierstrass form + CoordType string + CoordExtDegree uint8 // value n, such that q = pⁿ + CoordExtRoot int64 // value a, such that the field is Fp[X]/(Xⁿ - a) + PointName string + GLV bool // scalar multiplication using GLV + CofactorCleaning bool // flag telling if the Cofactor cleaning is available + CRange []int // multiexp bucket method: generate inner methods (with const arrays) for each c + Projective bool // generate projective coordinates + A []string //A linear coefficient in Weierstrass form + B []string //B constant term in Weierstrass form } var Curves []Curve diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index f77b9d5ca8..9af8b0dd15 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -4,6 +4,7 @@ import ( "fmt" "path/filepath" "reflect" + "sort" "strings" "text/template" @@ -76,18 +77,26 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er } return false } + lastCG1 := make([]int, 0) for i := 0; i < len(conf.G1.CRange); i++ { lc := lastC(conf.G1.CRange[i]) - if !contains(conf.G1.CRange, lc) && !contains(conf.G1.LastCRange, lc) { - conf.G1.LastCRange = append(conf.G1.LastCRange, lc) + if !contains(conf.G1.CRange, lc) && !contains(lastCG1, lc) { + lastCG1 = append(lastCG1, lc) } } + conf.G1.CRange = append(conf.G1.CRange, lastCG1...) + sort.Ints(conf.G1.CRange) + + lastCG2 := make([]int, 0) for i := 0; i < len(conf.G2.CRange); i++ { lc := lastC(conf.G2.CRange[i]) - if !contains(conf.G2.CRange, lc) && !contains(conf.G2.LastCRange, lc) { - conf.G2.LastCRange = append(conf.G2.LastCRange, lc) + if !contains(conf.G2.CRange, lc) && !contains(lastCG2, lc) { + lastCG2 = append(lastCG2, lc) } } + conf.G2.CRange = append(conf.G2.CRange, lastCG2...) 
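+	// keep the merged range sorted so the generated bucket types, interface
+	// constraints and switch cases are emitted in increasing window-size order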
+ sort.Ints(conf.G2.CRange) + bavardOpts := []func(*bavard.Bavard) error{bavard.Funcs(funcs)} if err := bgen.GenerateWithOptions(conf, packageName, "./ecc/template", bavardOpts, entries...); err != nil { return err diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index d1826081ce..c8a4e6b453 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -164,6 +164,10 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks // aggregate chunk stats chunkStats := make([]chunkStat, nbChunks) + if c <= 9 { + // no need to compute stats for small window sizes + return digits, chunkStats + } parallel.Execute(len(chunkStats), func(start, end int) { // for each chunk compute the statistics for chunkID := start; chunkID < end; chunkID++ { @@ -404,7 +408,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) implementedCs := []uint64{ - {{- range $c := $.CRange}} {{- if and (eq $.PointName "g1") (gt $c 21)}}{{- else}} {{$c}},{{- end}}{{- end}} + {{- range $c := $.CRange}}{{- if ge $c 4}}{{$c}},{{- end}}{{- end}} } var C uint64 // approximate cost (in group operations) @@ -460,6 +464,10 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) { switch c { + {{- range $c := $.LastCRange}} + case {{$c}}: + return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + {{- end }} {{range $c := $.CRange}} case {{$c}}: {{- if le $c 9}} diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 72acde2cd5..a3c609910f 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -14,8 +14,8 @@ import ( {{- end}} ) -{{ template "multiexp" dict "CoordType" .G1.CoordType "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} -{{ template "multiexp" dict "CoordType" .G2.CoordType "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} +{{ template "multiexp" dict "CoordType" .G1.CoordType "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{ template "multiexp" dict "CoordType" .G2.CoordType "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} {{define "multiexp" }} diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index 7aaec9f186..3fd44311bc 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ 
b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -8,8 +8,8 @@ -{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange "LastCRange" .G1.LastCRange}} -{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange "LastCRange" .G2.LastCRange}} +{{ template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange }} +{{ template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange }} @@ -66,14 +66,8 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk {{- range $c := $.CRange}} type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} {{- end}} -{{- range $c := $.LastCRange}} -type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} -{{- end}} type ib{{ $.TJacobianExtended }} interface { - {{- range $i, $c := $.LastCRange}} - bucket{{ $.TJacobianExtended }}C{{$c}} | - {{- end}} {{- range $i, $c := $.CRange}} bucket{{ $.TJacobianExtended }}C{{$c}} {{- if not (last $i $.CRange)}} | {{- end}} {{- end}} diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index cd2799bc6b..dcba38d621 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -267,7 +267,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var testPoint {{ $.TAffine }} - for i := 15; i <= pow; i++ { + for i := 5; i <= pow; i++ { using := 1 << i b.Run(fmt.Sprintf("%d points", using), func(b *testing.B) { From e76541e62e6ffd6c22987545437acb868f9caaae Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 14:33:00 -0600 Subject: [PATCH 33/43] test: restore test for all C --- ecc/bls12-377/multiexp.go | 188 +++++++++--------- ecc/bls12-377/multiexp_test.go | 179 +++++++++-------- ecc/bls12-378/multiexp.go | 188 +++++++++--------- ecc/bls12-378/multiexp_test.go | 179 +++++++++-------- ecc/bls12-381/multiexp.go | 188 +++++++++--------- ecc/bls12-381/multiexp_test.go | 179 +++++++++-------- ecc/bls24-315/multiexp.go | 188 +++++++++--------- ecc/bls24-315/multiexp_test.go | 179 +++++++++-------- ecc/bls24-317/multiexp.go | 188 +++++++++--------- ecc/bls24-317/multiexp_test.go | 179 +++++++++-------- ecc/bn254/multiexp.go | 188 +++++++++--------- ecc/bn254/multiexp_test.go | 179 +++++++++-------- ecc/bw6-633/multiexp.go | 164 +++++++-------- ecc/bw6-633/multiexp_test.go | 179 +++++++++-------- ecc/bw6-756/multiexp.go | 164 +++++++-------- ecc/bw6-756/multiexp_test.go | 179 +++++++++-------- ecc/bw6-761/multiexp.go | 172 ++++++++-------- ecc/bw6-761/multiexp_test.go | 179 +++++++++-------- go.mod | 2 +- go.sum | 4 +- .../generator/ecc/template/multiexp.go.tmpl | 67 +++---- .../ecc/template/tests/multiexp.go.tmpl | 92 ++++----- 22 files changed, 1716 insertions(+), 1688 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go 
index 14fdaa8bc8..7e519245cc 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. 
- n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
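+// Windows of 9 bits or fewer always use the generic jacobian-extended processor
+// (partitionScalars skips computing stats for c <= 9); for larger windows the
+// chunk statistics (e.g. stat.nbBucketFilled vs. the batch size) gate the
+// batch-affine path. For intuition: with a ~253-bit scalar field, c = 16 means
+// roughly ceil(253/16) = 16 chunks of 1<<15 buckets each, and the batch-affine
+// variant amortizes one field inversion over up to 640 bucket additions.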
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 3681f4fc71..4510933ea1 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
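+	// (intuition: equal scalars yield equal digits, so within a given window they
+	// all target the same bucket, and a bucket with an addition already queued in
+	// the current batch cannot take another one until the batch is flushed)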
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
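+	// (intuition: equal scalars yield equal digits, so within a given window they
+	// all target the same bucket, and a bucket with an addition already queued in
+	// the current batch cannot take another one until the batch is flushed)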
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 7ca2a9edeb..63476d87c9 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
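+	// split this chunk's points in two halves processed concurrently, then
+	// add the two partial sums before sending the chunk result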
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
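+// The switch below is generated per curve: it maps each supported window size
+// c to a specialized processor, and stat is meant to let it choose between
+// bucket methods (e.g. extended-Jacobian vs. batch-affine) for that chunk.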
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index cb553fad54..5bef450dc0 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
 FromMont()
+			if i%10 == 0 {
+				samplePointsZero[i].setInfinity()
+			}
 		}
 
 		results := make([]G1Jac, len(cRange))
-		for i := range cRange {
-			// TODO @gbotrel restore test for all C
-			results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{})
+		for i, c := range cRange {
+			_innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
 		}
 		for i := 1; i < len(results); i++ {
 			if !results[i].Equal(&results[i-1]) {
@@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) {
 	)
 
 	var (
-		samplePoints  [nbSamples]G1Affine
-		sampleScalars [nbSamples]fr.Element
-		// sampleScalarsSmallValues [nbSamples]fr.Element
-		// sampleScalarsRedundant [nbSamples]fr.Element
+		samplePoints             [nbSamples]G1Affine
+		sampleScalars            [nbSamples]fr.Element
+		sampleScalarsSmallValues [nbSamples]fr.Element
+		sampleScalarsRedundant   [nbSamples]fr.Element
 	)
 
 	fillBenchScalars(sampleScalars[:])
-	// copy(sampleScalarsSmallValues[:],sampleScalars[:])
-	// copy(sampleScalarsRedundant[:],sampleScalars[:])
-
-	// // this means first chunk is going to have more work to do and should be split into several go routines
-	// for i:=0; i < len(sampleScalarsSmallValues);i++ {
-	// 	if i % 5 == 0 {
-	// 		sampleScalarsSmallValues[i].SetZero()
-	// 		sampleScalarsSmallValues[i][0] = 1
-	// 	}
-	// }
-
-	// // bad case for batch affine because scalar distribution might look uniform
-	// // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine
-	// // to process small batches of additions to flush its queue of conflicted points.
-	// for i:=0; i < len(sampleScalarsRedundant);i+=100 {
-	// 	for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ {
-	// 		sampleScalarsRedundant[j] = sampleScalarsRedundant[i]
-	// 	}
-	// }
+	copy(sampleScalarsSmallValues[:], sampleScalars[:])
+	copy(sampleScalarsRedundant[:], sampleScalars[:])
+
+	// this means the first chunk will have more work to do than the others and should be split into several goroutines
+	for i := 0; i < len(sampleScalarsSmallValues); i++ {
+		if i%5 == 0 {
+			sampleScalarsSmallValues[i].SetZero()
+			sampleScalarsSmallValues[i][0] = 1
+		}
+	}
+
+	// bad case for batch affine because the scalar distribution might look uniform
+	// but over batchSize windows, we may hit a lot of conflicts and force the msm-affine
+	// to process small batches of additions to flush its queue of conflicted points.
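+	// concretely, the loop below keeps a single distinct value per window of
+	// 100 scalars (roughly nbSamples/100 distinct values overall); identical
+	// scalars decompose into identical c-bit digits, so in every chunk those
+	// 100 points all land in the same bucket.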
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
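+	// same adversarial input as the G1 benchmark above: repeated bucket hits
+	// force the batch-affine processor to flush short batches, and each flush
+	// is costlier here since G2 coordinates live in a quadratic extension.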
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 9de4ea488f..4a389f9e4f 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
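+			// the two halves are merged by the small fan-in goroutine below:
+			// chSplit is buffered with capacity 2 so neither half blocks on
+			// send, and the combined sum is forwarded to chChunks[j].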
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index b58a70c951..fdd7809b56 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
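+	// the small-values loop above forces every 5th scalar to a tiny value
+	// (only the lowest limb set), so those scalars contribute digits only in
+	// the lowest window; that skews work towards chunk 0, the case the
+	// weight >= 115 split in _innerMsmG1 is meant to absorb.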
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 1ca2222f9f..9b3b5eb0d8 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
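+			// chunkStats[j].weight is a per-chunk work estimate computed by
+			// partitionScalars; 115 is an empirical cutoff in this patch above
+			// which a single goroutine would lag behind the other chunks.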
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 54df2a76fa..bdb2e0167a 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
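+	// both skewed variants start from the same random scalars (copied above)
+	// and run against the same base points, so timing differences come from
+	// the digit distribution alone.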
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 2120c3b479..6b98c52875 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
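+			// chunks are dispatched from the most-significant window
+			// (j = nbChunks-1) down to 0; the top chunk may be narrower than
+			// c bits, hence the lastC(c) processor selected above.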
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
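+// The returned closure has the same signature whatever strategy is picked,
+// so the caller in _innerMsmG2 can hand it to a goroutine without knowing
+// which bucket method was chosen.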
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 0d15fed501..48420037d0 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
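[editor's sketch] The conflict mechanism the comment above describes can be made concrete: the batch-affine processor may queue each bucket at most once per batch, so a run of identical scalars keeps hitting the same bucket and forces the batch to be flushed early. A toy model, assuming a map in place of the generated fixed-size bitSetCxx types:

type bitSet map[uint16]bool // stand-in for the fixed-size bitSetCxx arrays

func countFlushes(digits []uint16, batchSize int) int {
    inBatch := bitSet{}
    flushes, n := 0, 0
    for _, d := range digits {
        if inBatch[d] || n == batchSize { // bucket conflict, or batch full
            flushes++
            inBatch, n = bitSet{}, 0
        }
        inBatch[d] = true
        n++
    }
    return flushes
}

Uniformly random digits flush roughly every batchSize additions; the redundant distribution built below flushes almost every addition.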
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
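[editor's sketch] A rough way to size the conflict pressure the comment above worries about: a c-bit signed-digit window drives on the order of 2^(c-1) buckets, so the chance a fresh digit collides with one already queued grows with batchSize/2^(c-1). A back-of-envelope helper, illustrative only:

func conflictPressure(c uint64, batchSize int) float64 {
    nbBuckets := float64(uint64(1) << (c - 1)) // signed-digit windows use ~2^(c-1) buckets
    return float64(batchSize) / nbBuckets      // rough per-addition collision rate
}

With the constants used in this patch that ratio is about 200/2048 ≈ 10% for c = 12 and 640/32768 ≈ 2% for c = 16, under a uniform-digit assumption.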
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index b0fb67e9af..791fc0c19e 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -128,14 +128,58 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG1(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG1(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG1(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. 
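[editor's sketch] The weight >= 115 test above fires exactly for distributions like the small-values benchmark: when most scalars fit in the lowest window, chunk 0 carries nearly all the work. A hypothetical illustration of that skew, ignoring the signed-digit recoding that partitionScalars actually performs:

func chunkWeights(scalars []uint64, c uint64, nbChunks int) []int {
    w := make([]int, nbChunks)
    mask := (uint64(1) << c) - 1
    for _, s := range scalars {
        for j := 0; j < nbChunks; j++ {
            if (s>>(uint64(j)*c))&mask != 0 {
                w[j]++ // a non-zero digit means work for chunk j
            }
        }
    }
    return w // the real chunkStat.weight is a normalized statistic; this just counts work
}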
+ chSplit := make(chan g1JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { @@ -224,50 +268,6 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g1JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g1JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG1(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. 
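[editor's sketch] Both the new code above and the deleted version below slice digits the same way: partitionScalars produces a chunk-major layout where, for n points, chunk j owns digits[j*n:(j+1)*n], and the heavy-chunk split hands half of that range to each worker. A sketch making the slicing explicit; chunkDigits is an illustrative helper:

func chunkDigits(digits []uint16, j, n int) (firstHalf, secondHalf []uint16) {
    chunk := digits[j*n : (j+1)*n] // chunk-major layout: chunk j owns n digits
    split := n / 2
    return chunk[:split], chunk[split:]
}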
- chSplit := make(chan g1JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG1Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -387,14 +387,58 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } + _innerMsmG2(p, C, points, scalars, config) + + return p, nil +} + +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) - _innerMsmG2(p, C, points, digits, chunkStats) + nbChunks := computeNbChunks(c) - return p, nil + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := getChunkProcessorG2(c, chunkStats[j]) + if j == int(nbChunks-1) { + processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) + } + if chunkStats[j].weight >= 115 { + // we split this in more go routines since this chunk has more work to do than the others. + // else what would happen is this go routine would finish much later than the others. + chSplit := make(chan g2JacExtended, 2) + split := n / 2 + go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) + go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) + go func(chunkID int) { + s1 := <-chSplit + s2 := <-chSplit + close(chSplit) + s1.add(&s2) + chChunks[chunkID] <- s1 + }(j) + continue + } + go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. 
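[editor's sketch] msmReduceChunkG2Affine, returned to just above, performs the standard bucket-method fold: start from the most-significant chunk, then repeatedly double c times and add the next chunk down. Modeled with integers standing in for curve points:

func reduceChunks(partials []int, c int) int {
    if len(partials) == 0 {
        return 0
    }
    total := partials[len(partials)-1] // most-significant chunk first
    for j := len(partials) - 2; j >= 0; j-- {
        for i := 0; i < c; i++ {
            total *= 2 // c doublings shift the accumulator down one window
        }
        total += partials[j]
    }
    return total
}

With partials p0..p2 this evaluates p2*2^(2c) + p1*2^c + p0, which is why each chunk can be computed independently before the fold.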
func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { @@ -483,50 +527,6 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch } } -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { - - nbChunks := computeNbChunks(c) - - // for each chunk, spawn one go routine that'll loop through all the scalars in the - // corresponding bit-window - // note that buckets is an array allocated on the stack and this is critical for performance - - // each go routine sends its result in chChunks[i] channel - chChunks := make([]chan g2JacExtended, nbChunks) - for i := 0; i < len(chChunks); i++ { - chChunks[i] = make(chan g2JacExtended, 1) - } - - // the last chunk may be processed with a different method than the rest, as it could be smaller. - n := len(points) - for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) - processChunk := getChunkProcessorG2(c, chunkStats[j]) - if j == int(nbChunks-1) { - processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) - } - if chunkStats[j].weight >= 115 { - // we split this in more go routines since this chunk has more work to do than the others. - // else what would happen is this go routine would finish much later than the others. - chSplit := make(chan g2JacExtended, 2) - split := n / 2 - go processChunk(uint64(j), chSplit, c, points[:split], digits[j*n:(j*n)+split]) - go processChunk(uint64(j), chSplit, c, points[split:], digits[(j*n)+split:(j+1)*n]) - go func(chunkID int) { - s1 := <-chSplit - s2 := <-chSplit - close(chSplit) - s1.add(&s2) - chChunks[chunkID] <- s1 - }(j) - continue - } - go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n]) - } - - return msmReduceChunkG2Affine(p, int(c), chChunks[:]) -} - // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 0fcdbce7bd..00cd01348e 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
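[editor's sketch] The samplePointsZero loop at the top of this hunk checks that inputs at infinity contribute nothing, for every window size. A hypothetical reference model of that property, with integers in place of group elements:

func referenceMSM(scalars []uint64, points []int64, isInf []bool) int64 {
    var acc int64
    for i := range points {
        if isInf[i] {
            continue // the group identity contributes nothing
        }
        acc += int64(scalars[i]) * points[i] // placeholder for scalar-mul-then-add
    }
    return acc
}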
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
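[editor's sketch] The test pattern used throughout this patch — running the inner MSM once per window size in cRange and requiring all results to agree — condenses to a helper like the following; crossCheck is illustrative, not part of the test suite, and assumes a non-empty cRange:

func crossCheck(cRange []uint64, msm func(c uint64) int) bool {
    prev := msm(cRange[0])
    for _, c := range cRange[1:] {
        r := msm(c)
        if r != prev {
            return false // two window sizes disagree: a bug in one processor
        }
        prev = r
    }
    return true
}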
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index c342b9a432..eeb7d4e43a 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -128,50 +128,14 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG1(p, C, points, digits, chunkStats) + _innerMsmG1(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - switch c { - - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] - case 4: - return processChunkG1Jacobian[bucketg1JacExtendedC4] - case 5: - return processChunkG1Jacobian[bucketg1JacExtendedC5] - case 8: - return processChunkG1Jacobian[bucketg1JacExtendedC8] - case 12: - const batchSize = 200 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG1Jacobian[bucketg1JacExtendedC12] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. 
- return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } -} - -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -188,7 +152,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -215,6 +178,43 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + switch c { + + case 1: + return processChunkG1Jacobian[bucketg1JacExtendedC1] + case 4: + return processChunkG1Jacobian[bucketg1JacExtendedC4] + case 5: + return processChunkG1Jacobian[bucketg1JacExtendedC5] + case 8: + return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 12: + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC12] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC12, bucketG1AffineC12, bitSetC12, pG1AffineC12, ppG1AffineC12, qG1AffineC12, cG1AffineC12] + case 16: + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } +} + // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -334,50 +334,14 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG2(p, C, points, digits, chunkStats) + _innerMsmG2(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - switch c { - - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] - case 4: - return processChunkG2Jacobian[bucketg2JacExtendedC4] - case 5: - return processChunkG2Jacobian[bucketg2JacExtendedC5] - case 8: - return processChunkG2Jacobian[bucketg2JacExtendedC8] - case 12: - const batchSize = 200 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC12] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } -} - -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -394,7 +358,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
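[editor's sketch] The context line above is the reason lastC(c) exists: the scalar bit-width is generally not a multiple of c, so the top window is narrower and may deserve a different processor. A hypothetical reading of that computation (the library's exact definition may differ, e.g. by an extra carry bit from the signed recoding):

func lastWindow(scalarBits, c uint64) uint64 {
    if r := scalarBits % c; r != 0 {
        return r // the top chunk only covers the leftover high bits
    }
    return c
}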
n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -421,6 +384,43 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + switch c { + + case 1: + return processChunkG2Jacobian[bucketg2JacExtendedC1] + case 4: + return processChunkG2Jacobian[bucketg2JacExtendedC4] + case 5: + return processChunkG2Jacobian[bucketg2JacExtendedC5] + case 8: + return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 12: + const batchSize = 200 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC12] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC12, bucketG2AffineC12, bitSetC12, pG2AffineC12, ppG2AffineC12, qG2AffineC12, cG2AffineC12] + case 16: + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } +} + // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 60444fc36e..6572865155 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). 
Mul(&sampleScalars[i-1], &mixer). FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
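[editor's sketch] The _innerMsm calls in this hunk pass ecc.MultiExpConfig{NbTasks: runtime.NumCPU()} explicitly because they bypass MultiExp, which is assumed to fill that default in itself. A sketch of that defaulting convention; effectiveNbTasks is an illustrative name:

package sketch

import "runtime"

func effectiveNbTasks(nbTasks int) int {
    if nbTasks <= 0 {
        return runtime.NumCPU() // unset config: use all available CPUs
    }
    return nbTasks
}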
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 72b83a7eac..35135f2959 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -128,50 +128,14 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG1(p, C, points, digits, chunkStats) + _innerMsmG1(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - switch c { - - case 3: - return processChunkG1Jacobian[bucketg1JacExtendedC3] - case 4: - return processChunkG1Jacobian[bucketg1JacExtendedC4] - case 5: - return processChunkG1Jacobian[bucketg1JacExtendedC5] - case 8: - return processChunkG1Jacobian[bucketg1JacExtendedC8] - case 11: - const batchSize = 150 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG1Jacobian[bucketg1JacExtendedC11] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. 
- return processChunkG1Jacobian[bucketg1JacExtendedC16] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG1Jacobian[bucketg1JacExtendedC16] - } -} - -func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac { +func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -188,7 +152,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG1(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG1(lastC(c), chunkStats[j]) @@ -215,6 +178,43 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt return msmReduceChunkG1Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG1 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. +func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { + switch c { + + case 3: + return processChunkG1Jacobian[bucketg1JacExtendedC3] + case 4: + return processChunkG1Jacobian[bucketg1JacExtendedC4] + case 5: + return processChunkG1Jacobian[bucketg1JacExtendedC5] + case 8: + return processChunkG1Jacobian[bucketg1JacExtendedC8] + case 11: + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG1Jacobian[bucketg1JacExtendedC11] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC11, bucketG1AffineC11, bitSetC11, pG1AffineC11, ppG1AffineC11, qG1AffineC11, cG1AffineC11] + case 16: + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. 
+ return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } +} + // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -334,50 +334,14 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG2(p, C, points, digits, chunkStats) + _innerMsmG2(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - switch c { - - case 3: - return processChunkG2Jacobian[bucketg2JacExtendedC3] - case 4: - return processChunkG2Jacobian[bucketg2JacExtendedC4] - case 5: - return processChunkG2Jacobian[bucketg2JacExtendedC5] - case 8: - return processChunkG2Jacobian[bucketg2JacExtendedC8] - case 11: - const batchSize = 150 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC11] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } -} - -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -394,7 +358,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
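[editor's sketch] The loop that follows feeds one buffered channel per chunk, so producers never block on the reducer and the reducer can drain results most-significant chunk first. A miniature of that fan-in, with ints in place of g2JacExtended:

func collect(chChunks []chan int) []int {
    out := make([]int, len(chChunks))
    for j := len(chChunks) - 1; j >= 0; j-- {
        out[j] = <-chChunks[j] // reducer consumes the highest chunk first
    }
    return out
}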
n := len(points) for j := int(nbChunks - 1); j >= 0; j-- { - // fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled) processChunk := getChunkProcessorG2(c, chunkStats[j]) if j == int(nbChunks-1) { processChunk = getChunkProcessorG2(lastC(c), chunkStats[j]) @@ -421,6 +384,43 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt return msmReduceChunkG2Affine(p, int(c), chChunks[:]) } +// getChunkProcessorG2 decides, depending on c window size and statistics for the chunk +// to return the best algorithm to process the chunk. +func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { + switch c { + + case 3: + return processChunkG2Jacobian[bucketg2JacExtendedC3] + case 4: + return processChunkG2Jacobian[bucketg2JacExtendedC4] + case 5: + return processChunkG2Jacobian[bucketg2JacExtendedC5] + case 8: + return processChunkG2Jacobian[bucketg2JacExtendedC8] + case 11: + const batchSize = 150 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC11] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC11, bucketG2AffineC11, bitSetC11, pG2AffineC11, ppG2AffineC11, qG2AffineC11, cG2AffineC11] + case 16: + const batchSize = 640 + // here we could check some chunk statistic (deviation, ...) to determine if calling + // the batch affine version is worth it. + if stat.nbBucketFilled < batchSize { + // clear indicator that batch affine method is not appropriate here. + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } +} + // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index e7244b2e97..f6d3a94ca9 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). 
Mul(&sampleScalars[i-1], &mixer). FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
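
In a nutshell, the reason redundancy hurts: within one batch, the batch-affine processor can schedule at most one addition per bucket, because the shared batch inversion produces a single lambda per bucket update. When long runs of identical scalars keep hitting the same buckets, the queue of conflicted points fills up and forces frequent, small flushes. A toy model of the effect (purely illustrative; the generated processor tracks conflicts with a bitset and a bounded queue, not a map):

func countFlushesSketch(digits []uint16, batchSize int) int {
	flushes := 0
	inBatch := make(map[uint16]bool) // buckets already touched in the current batch
	for _, bucketID := range digits {
		if inBatch[bucketID] || len(inBatch) == batchSize {
			flushes++ // conflict or full batch: run one shared inversion and reset
			inBatch = make(map[uint16]bool)
		}
		inBatch[bucketID] = true
	}
	return flushes
}

The loop right below builds exactly such a distribution, runs of 100 identical scalars, so a scheduler like this one flushes after nearly every point inside a run instead of once per batchSize points.
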
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
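
The small-values scenario set up a few lines above is just as easy to see at the digit level: a scalar below 2^c has a non-zero digit only in its least significant window, so chunk 0 receives all of the bucket work while every other chunk only sees zero digits. A hypothetical helper (not part of the package) that makes the skew visible:

func digitsOfSmallScalar(v uint64, c, nbChunks uint) []uint64 {
	digits := make([]uint64, nbChunks)
	mask := uint64(1)<<c - 1
	for i := uint(0); i < nbChunks; i++ {
		digits[i] = (v >> (i * c)) & mask // for v < 2^c, only digits[0] can be non-zero
	}
	return digits
}

This is precisely the imbalance the chunkStats returned by partitionScalars is meant to surface, so that an overloaded chunk can be split across several goroutines.
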
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 3f987c5bd2..5037c2e0be 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -128,52 +128,14 @@ func (p *G1Jac) MultiExp(points []G1Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG1(p, C, points, digits, chunkStats) + _innerMsmG1(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { - switch c { - - case 2: - return processChunkG1Jacobian[bucketg1JacExtendedC2] - case 3: - return processChunkG1Jacobian[bucketg1JacExtendedC3] - case 4: - return processChunkG1Jacobian[bucketg1JacExtendedC4] - case 5: - return processChunkG1Jacobian[bucketg1JacExtendedC5] - case 8: - return processChunkG1Jacobian[bucketg1JacExtendedC8] - case 10: - const batchSize = 80 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG1Jacobian[bucketg1JacExtendedC10] - } - return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. 
-		return processChunkG1Jacobian[bucketg1JacExtendedC16]
-	}
-	return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16]
-	default:
-		// panic("will not happen c != previous values is not generated by templates")
-		return processChunkG1Jacobian[bucketg1JacExtendedC16]
-	}
-}
-
-func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkStats []chunkStat) *G1Jac {
+func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac {
+	// partition the scalars
+	digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks)
 
 	nbChunks := computeNbChunks(c)
@@ -190,7 +152,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG1(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG1(lastC(c), chunkStats[j])
@@ -217,6 +178,45 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, digits []uint16, chunkSt
 	return msmReduceChunkG1Affine(p, int(c), chChunks[:])
 }
 
+// getChunkProcessorG1 returns the best algorithm to process the chunk,
+// depending on the window size c and the statistics gathered for the chunk.
+func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) {
+	switch c {
+
+	case 2:
+		return processChunkG1Jacobian[bucketg1JacExtendedC2]
+	case 3:
+		return processChunkG1Jacobian[bucketg1JacExtendedC3]
+	case 4:
+		return processChunkG1Jacobian[bucketg1JacExtendedC4]
+	case 5:
+		return processChunkG1Jacobian[bucketg1JacExtendedC5]
+	case 8:
+		return processChunkG1Jacobian[bucketg1JacExtendedC8]
+	case 10:
+		const batchSize = 80
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+			return processChunkG1Jacobian[bucketg1JacExtendedC10]
+		}
+		return processChunkG1BatchAffine[bucketg1JacExtendedC10, bucketG1AffineC10, bitSetC10, pG1AffineC10, ppG1AffineC10, qG1AffineC10, cG1AffineC10]
+	case 16:
+		const batchSize = 640
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+ return processChunkG1Jacobian[bucketg1JacExtendedC16] + } + return processChunkG1BatchAffine[bucketg1JacExtendedC16, bucketG1AffineC16, bitSetC16, pG1AffineC16, ppG1AffineC16, qG1AffineC16, cG1AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG1Jacobian[bucketg1JacExtendedC16] + } +} + // msmReduceChunkG1Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1Jac { var _p g1JacExtended @@ -336,52 +336,14 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsmG2(p, C, points, digits, chunkStats) + _innerMsmG2(p, C, points, scalars, config) return p, nil } -func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { - switch c { - - case 2: - return processChunkG2Jacobian[bucketg2JacExtendedC2] - case 3: - return processChunkG2Jacobian[bucketg2JacExtendedC3] - case 4: - return processChunkG2Jacobian[bucketg2JacExtendedC4] - case 5: - return processChunkG2Jacobian[bucketg2JacExtendedC5] - case 8: - return processChunkG2Jacobian[bucketg2JacExtendedC8] - case 10: - const batchSize = 80 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC10] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10] - case 16: - const batchSize = 640 - // here we could check some chunk statistic (deviation, ...) to determine if calling - // the batch affine version is worth it. - if stat.nbBucketFilled < batchSize { - // clear indicator that batch affine method is not appropriate here. - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } - return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] - default: - // panic("will not happen c != previous values is not generated by templates") - return processChunkG2Jacobian[bucketg2JacExtendedC16] - } -} - -func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkStats []chunkStat) *G2Jac { +func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, chunkStats := partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks) nbChunks := computeNbChunks(c) @@ -398,7 +360,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt // the last chunk may be processed with a different method than the rest, as it could be smaller. 
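
Once every chunk goroutine has reported on its channel, msmReduceChunkG2Affine folds the partial sums together. The recombination is the classic Pippenger reduction: chunk j carries weight 2^(c·j), so the accumulator starts from the most significant chunk and is doubled c times before each lower chunk is added in. A condensed sketch of that idea, borrowing the extended-Jacobian type and channel layout used throughout this patch (method names follow the package's existing conventions; this shows the shape of the reduction, not the generated body):

func reduceChunksSketch(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac {
	acc := <-chChunks[len(chChunks)-1] // most significant chunk first
	for j := len(chChunks) - 2; j >= 0; j-- {
		for l := 0; l < c; l++ {
			acc.double(&acc) // one doubling per bit of the window
		}
		t := <-chChunks[j]
		acc.add(&t)
	}
	return p.unsafeFromJacExtended(&acc)
}
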
n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessorG2(c, chunkStats[j])
 		if j == int(nbChunks-1) {
 			processChunk = getChunkProcessorG2(lastC(c), chunkStats[j])
@@ -425,6 +386,45 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, digits []uint16, chunkSt
 	return msmReduceChunkG2Affine(p, int(c), chChunks[:])
 }
 
+// getChunkProcessorG2 returns the best algorithm to process the chunk,
+// depending on the window size c and the statistics gathered for the chunk.
+func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) {
+	switch c {
+
+	case 2:
+		return processChunkG2Jacobian[bucketg2JacExtendedC2]
+	case 3:
+		return processChunkG2Jacobian[bucketg2JacExtendedC3]
+	case 4:
+		return processChunkG2Jacobian[bucketg2JacExtendedC4]
+	case 5:
+		return processChunkG2Jacobian[bucketg2JacExtendedC5]
+	case 8:
+		return processChunkG2Jacobian[bucketg2JacExtendedC8]
+	case 10:
+		const batchSize = 80
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+			return processChunkG2Jacobian[bucketg2JacExtendedC10]
+		}
+		return processChunkG2BatchAffine[bucketg2JacExtendedC10, bucketG2AffineC10, bitSetC10, pG2AffineC10, ppG2AffineC10, qG2AffineC10, cG2AffineC10]
+	case 16:
+		const batchSize = 640
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+ return processChunkG2Jacobian[bucketg2JacExtendedC16] + } + return processChunkG2BatchAffine[bucketg2JacExtendedC16, bucketG2AffineC16, bitSetC16, pG2AffineC16, ppG2AffineC16, qG2AffineC16, cG2AffineC16] + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunkG2Jacobian[bucketg2JacExtendedC16] + } +} + // msmReduceChunkG2Affine reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2Jac { var _p g2JacExtended diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 8354467f4b..83f5b41bbd 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -21,6 +21,7 @@ import ( "math/big" "math/bits" "math/rand" + "runtime" "sync" "testing" "time" @@ -35,9 +36,9 @@ func TestMultiExpG1(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -125,9 +126,8 @@ func TestMultiExpG1(t *testing.T) { } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -159,12 +159,14 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G1Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -222,32 +224,32 @@ func BenchmarkMultiExpG1(b *testing.B) { ) var ( - samplePoints [nbSamples]G1Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G1Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. 
- // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. + for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG1(samplePoints[:]) @@ -263,19 +265,19 @@ func BenchmarkMultiExpG1(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } @@ -354,9 +356,9 @@ func TestMultiExpG2(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } properties := gopter.NewProperties(parameters) @@ -442,9 +444,8 @@ func TestMultiExpG2(t *testing.T) { } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp(samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -476,12 +477,14 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i%10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]G2Jac, len(cRange)) - for i := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -539,32 +542,32 @@ func BenchmarkMultiExpG2(b *testing.B) { ) var ( - samplePoints [nbSamples]G2Affine - sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + samplePoints [nbSamples]G2Affine + sampleScalars [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:], sampleScalars[:]) + copy(sampleScalarsRedundant[:], sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i := 0; i < len(sampleScalarsSmallValues); i++ { + if i%5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
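
With the three scalar distributions restored, the benchmark variants can be compared directly with the standard tooling, for instance:

go test -run='^$' -bench='BenchmarkMultiExpG2' -benchtime=10x ./ecc/bw6-761/

(-run='^$' skips the unit tests; the points-redundancy variant, whose input is built by the loop right below this note, is the stress test for the batch-affine conflict queue.)
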
+ for i := 0; i < len(sampleScalarsRedundant); i += 100 { + for j := i + 1; j < i+100 && j < len(sampleScalarsRedundant); j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBasesG2(samplePoints[:]) @@ -580,19 +583,19 @@ func BenchmarkMultiExpG2(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using], ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using], ecc.MultiExpConfig{}) + } + }) } } diff --git a/go.mod b/go.mod index f1fd1fb56c..fe822ccf53 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/spf13/cobra v1.5.0 github.com/stretchr/testify v1.8.0 golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa - golang.org/x/sys v0.0.0-20220727055044-e65921a090b8 + golang.org/x/sys v0.2.0 ) require ( diff --git a/go.sum b/go.sum index a0175604ce..b3ae5f84f9 100644 --- a/go.sum +++ b/go.sum @@ -26,8 +26,8 @@ github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PK github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa h1:zuSxTR4o9y82ebqCUJYNGJbGPo6sKVl54f/TVDObg1c= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/sys v0.0.0-20220727055044-e65921a090b8 h1:dyU22nBWzrmTQxtNrr4dzVOvaw35nUYE279vF9UmsI8= -golang.org/x/sys v0.0.0-20220727055044-e65921a090b8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.2.0 h1:ljd4t30dBnAvMZaQCevtY0xLLD0A+bRZXbgLMLU1F/A= +golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index c8a4e6b453..13e0171a3c 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -453,44 +453,15 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem } } - // partition the scalars - digits, chunkStats := partitionScalars(scalars, C, config.ScalarsMont, config.NbTasks) - - _innerMsm{{ $.UPointName }}(p, C, points, digits, chunkStats) + _innerMsm{{ $.UPointName }}(p, C, points, scalars, config) return p, nil } +func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) *{{ $.TJacobian }} { + // partition the scalars + digits, chunkStats := 
partitionScalars(scalars, c, config.ScalarsMont, config.NbTasks)
 
-
-func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) {
-	switch c {
-	{{- range $c := $.LastCRange}}
-	case {{$c}}:
-		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
-	{{- end }}
-	{{range $c := $.CRange}}
-	case {{$c}}:
-		{{- if le $c 9}}
-		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
-		{{- else}}
-		const batchSize = {{batchSize $c}}
-		// here we could check some chunk statistic (deviation, ...) to determine if calling
-		// the batch affine version is worth it.
-		if stat.nbBucketFilled < batchSize {
-			// clear indicator that batch affine method is not appropriate here.
-			return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
-		}
-		return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}]
-		{{- end}}
-	{{- end}}
-	default:
-		// panic("will not happen c != previous values is not generated by templates")
-		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C16]
-	}
-}
-
-func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.TAffine }}, digits []uint16, chunkStats []chunkStat) *{{ $.TJacobian }} {
-
 	nbChunks := computeNbChunks(c)
 
 	// for each chunk, spawn one go routine that'll loop through all the scalars in the
@@ -506,7 +477,6 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 	// the last chunk may be processed with a different method than the rest, as it could be smaller.
 	n := len(points)
 	for j := int(nbChunks - 1); j >= 0; j-- {
-		// fmt.Printf("chunk[%d]: %f %f \n", j, chunkStats[j].weight, chunkStats[j].ppBucketFilled)
 		processChunk := getChunkProcessor{{ $.UPointName }}(c, chunkStats[j])
 		if j == int(nbChunks - 1) {
 			processChunk = getChunkProcessor{{ $.UPointName }}(lastC(c), chunkStats[j])
@@ -534,6 +504,35 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 
 }
 
+// getChunkProcessor{{ $.UPointName }} returns the best algorithm to process the chunk,
+// depending on the window size c and the statistics gathered for the chunk.
+func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- {{ $.TJacobianExtended }}, c uint64, points []{{ $.TAffine }}, digits []uint16) {
+	switch c {
+	{{- range $c := $.LastCRange}}
+	case {{$c}}:
+		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
+	{{- end }}
+	{{range $c := $.CRange}}
+	case {{$c}}:
+		{{- if le $c 9}}
+		return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
+		{{- else}}
+		const batchSize = {{batchSize $c}}
+		// here we could check some chunk statistic (deviation, ...) to determine if calling
+		// the batch affine version is worth it.
+		if stat.nbBucketFilled < batchSize {
+			// clear indicator that batch affine method is not appropriate here.
+ return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}] + } + return processChunk{{ $.UPointName }}BatchAffine[bucket{{ $.TJacobianExtended }}C{{$c}}, bucket{{ $.TAffine }}C{{$c}}, bitSetC{{$c}}, p{{$.TAffine}}C{{$c}}, pp{{$.TAffine}}C{{$c}}, q{{$.TAffine}}C{{$c}}, c{{$.TAffine}}C{{$c}}] + {{- end}} + {{- end}} + default: + // panic("will not happen c != previous values is not generated by templates") + return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C16] + } +} + // msmReduceChunk{{ $.TAffine }} reduces the weighted sum of the buckets into the result of the multiExp func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan {{ $.TJacobianExtended }}) *{{ $.TJacobian }} { diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index dcba38d621..5079d33d3b 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -10,6 +10,7 @@ import ( "fmt" "time" + "runtime" "math/rand" "math/big" "testing" @@ -23,8 +24,8 @@ import ( ) -{{template "multiexp" dict "PointName" .G1.PointName "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} -{{template "multiexp" dict "PointName" .G2.PointName "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} +{{template "multiexp" dict "PointName" .G1.PointName "UPointName" (toUpper .G1.PointName) "TAffine" $G1TAffine "TJacobian" $G1TJacobian "TJacobianExtended" $G1TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G1.CRange}} +{{template "multiexp" dict "PointName" .G2.PointName "UPointName" (toUpper .G2.PointName) "TAffine" $G2TAffine "TJacobian" $G2TJacobian "TJacobianExtended" $G2TJacobianExtended "FrNbWords" .Fr.NbWords "CRange" .G2.CRange}} {{define "multiexp" }} @@ -32,9 +33,9 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { parameters := gopter.DefaultTestParameters() if testing.Short() { - parameters.MinSuccessfulTests = 2 + parameters.MinSuccessfulTests = 3 } else { - parameters.MinSuccessfulTests = nbFuzzShort + parameters.MinSuccessfulTests = nbFuzzShort * 2 } @@ -100,7 +101,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { // cRange is generated from template and contains the available parameters for the multiexp window size {{- if eq $.PointName "g1" }} cRange := []uint64{ - {{- range $c := $.CRange}} {{- if and (eq $.PointName "g1") (gt $c 21)}}{{- else}} {{$c}},{{- end}}{{- end}} + {{- range $c := $.CRange}}{{$c}},{{- end}} } if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) @@ -133,9 +134,8 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { results := make([]{{ $.TJacobian }}, len(cRange)) - for i, _ := range cRange { - // TODO @gbotrel restore test of all C - results[i].MultiExp( samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsm{{ $.UPointName }}(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i:=1; i < len(results);i++ { if !results[i].Equal(&results[i-1]) { @@ -167,12 +167,14 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() + if i % 10 == 0 { + samplePointsZero[i].setInfinity() + } } results := make([]{{ $.TJacobian }}, len(cRange)) - for i, _ := range cRange { - // TODO @gbotrel restore test for all C - results[i].MultiExp(samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{}) + for i, c := range cRange { + _innerMsm{{ $.UPointName }}(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { @@ -237,30 +239,30 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { var ( samplePoints [nbSamples]{{ $.TAffine }} sampleScalars [nbSamples]fr.Element - // sampleScalarsSmallValues [nbSamples]fr.Element - // sampleScalarsRedundant [nbSamples]fr.Element + sampleScalarsSmallValues [nbSamples]fr.Element + sampleScalarsRedundant [nbSamples]fr.Element ) fillBenchScalars(sampleScalars[:]) - // copy(sampleScalarsSmallValues[:],sampleScalars[:]) - // copy(sampleScalarsRedundant[:],sampleScalars[:]) - - // // this means first chunk is going to have more work to do and should be split into several go routines - // for i:=0; i < len(sampleScalarsSmallValues);i++ { - // if i % 5 == 0 { - // sampleScalarsSmallValues[i].SetZero() - // sampleScalarsSmallValues[i][0] = 1 - // } - // } - - // // bad case for batch affine because scalar distribution might look uniform - // // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine - // // to process small batches of additions to flush its queue of conflicted points. - // for i:=0; i < len(sampleScalarsRedundant);i+=100 { - // for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { - // sampleScalarsRedundant[j] = sampleScalarsRedundant[i] - // } - // } + copy(sampleScalarsSmallValues[:],sampleScalars[:]) + copy(sampleScalarsRedundant[:],sampleScalars[:]) + + // this means first chunk is going to have more work to do and should be split into several go routines + for i:=0; i < len(sampleScalarsSmallValues);i++ { + if i % 5 == 0 { + sampleScalarsSmallValues[i].SetZero() + sampleScalarsSmallValues[i][0] = 1 + } + } + + // bad case for batch affine because scalar distribution might look uniform + // but over batchSize windows, we may hit a lot of conflicts and force the msm-affine + // to process small batches of additions to flush its queue of conflicted points. 
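
The batchSize thresholds this template wires into getChunkProcessor* (80, 150 or 640 in the generated files above, via {{batchSize $c}}) encode a simple amortization argument: batch-affine replaces the one field inversion a plain affine addition would need with a single inversion shared by the whole batch, so it only wins once a chunk fills enough distinct buckets between flushes. A deliberately simple sketch of the resulting rule (illustrative; the actual constants were presumably tuned with benchmarks like the ones restored here):

func useBatchAffineSketch(nbBucketFilled, batchSize int) bool {
	// with fewer filled buckets than one batch, the shared inversion and the
	// queue bookkeeping cannot be amortized: extended Jacobian stays cheaper
	return nbBucketFilled >= batchSize
}
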
+ for i:=0; i < len(sampleScalarsRedundant);i+=100 { + for j:=i+1; j < i+100 && j < len(sampleScalarsRedundant);j++ { + sampleScalarsRedundant[j] = sampleScalarsRedundant[i] + } + } fillBenchBases{{ toUpper $.PointName }}(samplePoints[:]) @@ -277,19 +279,19 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) { } }) - // b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) - // } - // }) - - // b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { - // b.ResetTimer() - // for j := 0; j < b.N; j++ { - // testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) - // } - // }) + b.Run(fmt.Sprintf("%d points-smallvalues", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsSmallValues[:using],ecc.MultiExpConfig{}) + } + }) + + b.Run(fmt.Sprintf("%d points-redundancy", using), func(b *testing.B) { + b.ResetTimer() + for j := 0; j < b.N; j++ { + testPoint.MultiExp(samplePoints[:using], sampleScalarsRedundant[:using],ecc.MultiExpConfig{}) + } + }) } } From 2a8d8e6d7b27382dd3d66cc18d2bbfa2e612892f Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 14:41:28 -0600 Subject: [PATCH 34/43] test: restore bench batchadd --- ecc/bls12-377/g1_test.go | 41 +++++++++--------- ecc/bls12-377/g2_test.go | 41 +++++++++--------- ecc/bls12-378/g1_test.go | 41 +++++++++--------- ecc/bls12-378/g2_test.go | 41 +++++++++--------- ecc/bls12-381/g1_test.go | 41 +++++++++--------- ecc/bls12-381/g2_test.go | 41 +++++++++--------- ecc/bls24-315/g1_test.go | 41 +++++++++--------- ecc/bls24-315/g2_test.go | 41 +++++++++--------- ecc/bls24-317/g1_test.go | 41 +++++++++--------- ecc/bls24-317/g2_test.go | 41 +++++++++--------- ecc/bn254/g1_test.go | 41 +++++++++--------- ecc/bn254/g2_test.go | 41 +++++++++--------- ecc/bw6-633/g1_test.go | 41 +++++++++--------- ecc/bw6-633/g2_test.go | 41 +++++++++--------- ecc/bw6-756/g1_test.go | 41 +++++++++--------- ecc/bw6-756/g2_test.go | 41 +++++++++--------- ecc/bw6-761/g1_test.go | 41 +++++++++--------- ecc/bw6-761/g2_test.go | 41 +++++++++--------- .../ecc/template/tests/point.go.tmpl | 43 ++++++++++--------- 19 files changed, 400 insertions(+), 381 deletions(-) diff --git a/ecc/bls12-377/g1_test.go b/ecc/bls12-377/g1_test.go index eb09d3cca4..0dffff841c 100644 --- a/ecc/bls12-377/g1_test.go +++ b/ecc/bls12-377/g1_test.go @@ -19,6 +19,7 @@ package bls12377 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random 
permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-377/g2_test.go b/ecc/bls12-377/g2_test.go index c0653c32af..25bcb60ee4 100644 --- a/ecc/bls12-377/g2_test.go +++ b/ecc/bls12-377/g2_test.go @@ -19,6 +19,7 @@ package bls12377 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-377/internal/fptower" @@ -505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-378/g1_test.go b/ecc/bls12-378/g1_test.go index 6752818d29..70c9dc5603 100644 --- a/ecc/bls12-378/g1_test.go +++ b/ecc/bls12-378/g1_test.go @@ -19,6 +19,7 @@ package bls12378 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + 
b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-378/g2_test.go b/ecc/bls12-378/g2_test.go index a9632dc413..ef146f5247 100644 --- a/ecc/bls12-378/g2_test.go +++ b/ecc/bls12-378/g2_test.go @@ -19,6 +19,7 @@ package bls12378 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-378/internal/fptower" @@ -505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-381/g1_test.go b/ecc/bls12-381/g1_test.go index 223c3763c0..ce531be7ae 100644 --- a/ecc/bls12-381/g1_test.go +++ b/ecc/bls12-381/g1_test.go @@ -19,6 +19,7 @@ package bls12381 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls12-381/g2_test.go 
b/ecc/bls12-381/g2_test.go index be4957738e..27cccc9938 100644 --- a/ecc/bls12-381/g2_test.go +++ b/ecc/bls12-381/g2_test.go @@ -19,6 +19,7 @@ package bls12381 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls12-381/internal/fptower" @@ -505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-315/g1_test.go b/ecc/bls24-315/g1_test.go index 4ffe3679c7..27054cb2fa 100644 --- a/ecc/bls24-315/g1_test.go +++ b/ecc/bls24-315/g1_test.go @@ -19,6 +19,7 @@ package bls24315 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-315/g2_test.go b/ecc/bls24-315/g2_test.go index 019fa5ec24..d923c5a007 100644 --- a/ecc/bls24-315/g2_test.go +++ b/ecc/bls24-315/g2_test.go @@ -19,6 +19,7 @@ package bls24315 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-315/internal/fptower" @@ 
-505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-317/g1_test.go b/ecc/bls24-317/g1_test.go index 3a89f924e5..fc26138b40 100644 --- a/ecc/bls24-317/g1_test.go +++ b/ecc/bls24-317/g1_test.go @@ -19,6 +19,7 @@ package bls24317 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bls24-317/g2_test.go b/ecc/bls24-317/g2_test.go index 1d7ed1f3ff..8068d382f0 100644 --- a/ecc/bls24-317/g2_test.go +++ b/ecc/bls24-317/g2_test.go @@ -19,6 +19,7 @@ package bls24317 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bls24-317/internal/fptower" @@ -505,32 +506,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { 
-// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bn254/g1_test.go b/ecc/bn254/g1_test.go index 2e1973a911..c62747066a 100644 --- a/ecc/bn254/g1_test.go +++ b/ecc/bn254/g1_test.go @@ -19,6 +19,7 @@ package bn254 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/fp" @@ -460,32 +461,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bn254/g2_test.go b/ecc/bn254/g2_test.go index ae107fea78..5b103f2c4f 100644 --- a/ecc/bn254/g2_test.go +++ b/ecc/bn254/g2_test.go @@ -19,6 +19,7 @@ package bn254 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bn254/internal/fptower" @@ -504,32 +505,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// 
rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-633/g1_test.go b/ecc/bw6-633/g1_test.go index a2b6c273f6..afa183ba27 100644 --- a/ecc/bw6-633/g1_test.go +++ b/ecc/bw6-633/g1_test.go @@ -19,6 +19,7 @@ package bw6633 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-633/g2_test.go b/ecc/bw6-633/g2_test.go index f5c4d5edca..cb8469886c 100644 --- a/ecc/bw6-633/g2_test.go +++ b/ecc/bw6-633/g2_test.go @@ -19,6 +19,7 @@ package bw6633 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-633/fp" @@ -486,32 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// 
b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-756/g1_test.go b/ecc/bw6-756/g1_test.go index bd7a65f693..729fc7dfd6 100644 --- a/ecc/bw6-756/g1_test.go +++ b/ecc/bw6-756/g1_test.go @@ -19,6 +19,7 @@ package bw6756 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-756/g2_test.go b/ecc/bw6-756/g2_test.go index 7d98c06668..95bd4e2312 100644 --- a/ecc/bw6-756/g2_test.go +++ b/ecc/bw6-756/g2_test.go @@ -19,6 +19,7 @@ package bw6756 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-756/fp" @@ -486,32 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func 
BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-761/g1_test.go b/ecc/bw6-761/g1_test.go index 4cbc725f60..f679f9719c 100644 --- a/ecc/bw6-761/g1_test.go +++ b/ecc/bw6-761/g1_test.go @@ -19,6 +19,7 @@ package bw6761 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -499,32 +500,32 @@ func BenchmarkG1JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG1Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G1Affine -// var RR, PP [MAX_BATCH_SIZE]*G1Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG1Affine(b *testing.B) { -// fillBenchBasesG1(P[:]) -// fillBenchBasesG1(R[:]) + var P, R pG1AffineC16 + var RR ppG1AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG1(P[:]) + fillBenchBasesG1(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG1Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG1Affine[pG1AffineC16, ppG1AffineC16, cG1AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG1AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/ecc/bw6-761/g2_test.go b/ecc/bw6-761/g2_test.go index 7fa415d6a5..a0bd87be5e 100644 --- a/ecc/bw6-761/g2_test.go +++ b/ecc/bw6-761/g2_test.go @@ -19,6 +19,7 @@ package bw6761 import ( "fmt" "math/big" + "math/rand" "testing" "github.com/consensys/gnark-crypto/ecc/bw6-761/fp" @@ -486,32 +487,32 @@ func BenchmarkG2JacIsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAddG2Affine(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]G2Affine -// var RR, PP [MAX_BATCH_SIZE]*G2Affine -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAddG2Affine(b *testing.B) { -// fillBenchBasesG2(P[:]) -// fillBenchBasesG2(R[:]) + var P, R pG2AffineC16 + var RR ppG2AffineC16 + ridx := make([]int, len(P)) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } + // TODO P == R may produce skewed benches + fillBenchBasesG2(P[:]) + fillBenchBasesG2(R[:]) -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i := 0; i < len(ridx); i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAddG2Affine(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAddG2Affine[pG2AffineC16, ppG2AffineC16, cG2AffineC16](&RR, &P, len(P)) + } +} func BenchmarkG2AffineBatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled diff --git a/internal/generator/ecc/template/tests/point.go.tmpl b/internal/generator/ecc/template/tests/point.go.tmpl index 223bfbe040..1f342412b8 100644 --- 
a/internal/generator/ecc/template/tests/point.go.tmpl +++ b/internal/generator/ecc/template/tests/point.go.tmpl @@ -16,6 +16,7 @@ import ( "fmt" "math/big" "testing" + "math/rand" {{if or (eq .CoordType "fptower.E2") (eq .CoordType "fptower.E4")}} "github.com/consensys/gnark-crypto/ecc/{{.Name}}/internal/fptower" @@ -559,32 +560,32 @@ func Benchmark{{ $TJacobian }}IsInSubGroup(b *testing.B) { } -// func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { -// var P, R [MAX_BATCH_SIZE]{{ $TAffine }} -// var RR, PP [MAX_BATCH_SIZE]*{{ $TAffine }} -// var ridx [MAX_BATCH_SIZE]int +func BenchmarkBatchAdd{{ $TAffine }}(b *testing.B) { + {{$c := 16}} + var P, R p{{$TAffine}}C{{$c}} + var RR pp{{$TAffine}}C{{$c}} + ridx := make([]int, len(P)) -// fillBenchBases{{ toUpper $.PointName }}(P[:]) -// fillBenchBases{{ toUpper $.PointName }}(R[:]) + // TODO P == R may produce skewed benches + fillBenchBases{{ toUpper $.PointName }}(P[:]) + fillBenchBases{{ toUpper $.PointName }}(R[:]) -// for i:=0; i < len(ridx);i++ { -// ridx[i] = i -// } - -// // random permute -// rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) + for i:=0; i < len(ridx);i++ { + ridx[i] = i + } -// for i, ri := range ridx { -// RR[i] = &R[ri] -// PP[i] = &P[ri] -// } + // random permute + rand.Shuffle(len(ridx), func(i, j int) { ridx[i], ridx[j] = ridx[j], ridx[i] }) -// b.ResetTimer() -// for i := 0; i < b.N; i++ { -// batchAdd{{ $TAffine }}(RR[:], PP[:], MAX_BATCH_SIZE / 2, MAX_BATCH_SIZE / 2) -// } + for i, ri := range ridx { + RR[i] = &R[ri] + } -// } + b.ResetTimer() + for i := 0; i < b.N; i++ { + batchAdd{{ $TAffine }}[p{{$TAffine}}C{{$c}}, pp{{$TAffine}}C{{$c}}, c{{$TAffine}}C{{$c}}](&RR, &P, len(P)) + } +} func Benchmark{{ $TAffine }}BatchScalarMultiplication(b *testing.B) { // ensure every words of the scalars are filled From 6049e2ff393a914d1ba1e3377c9a5b200d3c3835 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 15:13:05 -0600 Subject: [PATCH 35/43] bug: bug when c==1 msm ext jac incorrect --- ecc/bls12-377/multiexp.go | 4 ++++ ecc/bls12-377/multiexp_test.go | 6 +++++- ecc/bls12-378/multiexp.go | 4 ++++ ecc/bls12-378/multiexp_test.go | 4 ++++ ecc/bls12-381/multiexp.go | 4 ++++ ecc/bls12-381/multiexp_test.go | 6 +++++- ecc/bls24-315/multiexp.go | 4 ++++ ecc/bls24-315/multiexp_test.go | 6 +++++- ecc/bls24-317/multiexp.go | 4 ++++ ecc/bls24-317/multiexp_test.go | 6 +++++- ecc/bn254/multiexp.go | 4 ++++ ecc/bn254/multiexp_test.go | 4 ++++ ecc/bw6-633/multiexp.go | 4 ++++ ecc/bw6-633/multiexp_test.go | 6 +++++- ecc/bw6-756/multiexp.go | 4 ++++ ecc/bw6-756/multiexp_test.go | 4 ++++ ecc/bw6-761/multiexp.go | 4 ++++ ecc/bw6-761/multiexp_test.go | 4 ++++ internal/generator/ecc/template/multiexp.go.tmpl | 2 ++ internal/generator/ecc/template/tests/multiexp.go.tmpl | 4 +++- 20 files changed, 82 insertions(+), 6 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 7e519245cc..00f3a97050 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
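For context on the hunk above: msmReduceChunk folds the per-chunk partial sums with a Horner scheme. If the scalars decompose as s = sum_j d_j * 2^(c*j), then starting from the highest chunk and repeating "double c times, then add the next chunk sum" reconstructs the full result from the chunk results. The same fold, sketched on integers rather than curve points:

	import "math/big"

	// Integer model of the reduction above: chunks holds the per-chunk
	// values, lowest chunk first; each Lsh by c stands for the c group
	// doublings, each Add for folding in the next chunk sum.
	func reduceChunks(chunks []*big.Int, c uint) *big.Int {
		total := new(big.Int).Set(chunks[len(chunks)-1])
		for j := len(chunks) - 2; j >= 0; j-- {
			total.Lsh(total, c)
			total.Add(total, chunks[j])
		}
		return total
	}

The TODOs flag the degenerate case: the fold is only correct on all-zero inputs if doubling and converting the point at infinity stay well-defined.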
return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 4510933ea1..1eb820de13 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 63476d87c9..133242f8af 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
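A quick sanity check of that fold, with c = 2 and the scalar 13 = 0b1101, whose 2-bit chunks (lowest first) are 0b01 = 1 and 0b11 = 3:

	// Usage of the reduceChunks sketch (with "fmt" and "math/big" imported):
	// 13 = 3*2^2 + 1, and the fold computes total = 3, then 3<<2 = 12,
	// then 12 + 1 = 13.
	fmt.Println(reduceChunks([]*big.Int{big.NewInt(1), big.NewInt(3)}, 2)) // 13

Note that the production code uses signed window digits rather than the plain bit slices shown here; c = 1 is a degenerate window under that scheme, and rather than supporting it this patch drops 1 from the tested cRange.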
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 5bef450dc0..ab0f36f2ec 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 4a389f9e4f..60e4686759 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index fdd7809b56..aa5814cf6e 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 9b3b5eb0d8..85c2a14d17 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index bdb2e0167a..9787f4f172 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 6b98c52875..733358396c 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 48420037d0..54fbbb031f 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index 791fc0c19e..d373d1683e 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -275,12 +275,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -534,12 +536,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
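Since the TODO recurs in every generated copy, here is the shape a defensive fix could take, assuming the extended-Jacobian type marks infinity with ZZ == 0 and that a package-level g1Infinity value exists (both are assumptions about the curve packages, not confirmed here):

	// Hypothetical guard for the conversion below: stay well-defined even
	// when the accumulated extended-Jacobian point is the point at infinity.
	func safeFromJacExtended(p *G1Jac, q *g1JacExtended) *G1Jac {
		if q.ZZ.IsZero() { // assumed infinity marker of g1JacExtended
			p.Set(&g1Infinity) // assumed package-level infinity constant
			return p
		}
		return p.unsafeFromJacExtended(q)
	}

Whether double() already tolerates the zero case decides if the in-loop TODO needs the same guard; that question is left open here, as in the TODOs themselves.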
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 00cd01348e..a6cf0fbe50 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index eeb7d4e43a..dbbd344b08 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -222,12 +222,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -428,12 +430,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 6572865155..8e5c73a1e6 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -100,7 +100,7 @@ func TestMultiExpG1(t *testing.T) { )) // cRange is generated from template and contains the available parameters for the multiexp window size - cRange := []uint64{1, 4, 5, 8, 12, 16} + cRange := []uint64{4, 5, 8, 12, 16} if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) cRange = []uint64{5, 16} @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 35135f2959..83b43d9a33 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -222,12 +222,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -428,12 +430,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index f6d3a94ca9..3daeb64fca 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index 5037c2e0be..a0d2a19620 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -224,12 +224,14 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -432,12 +434,14 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 83f5b41bbd..8520cc7435 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -131,6 +131,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -170,6 +171,7 @@ func TestMultiExpG1(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -449,6 +451,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } @@ -488,6 +491,7 @@ func TestMultiExpG2(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) return false } } diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 13e0171a3c..5f3d40e9b5 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -541,12 +541,14 @@ func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } + // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index 5079d33d3b..08c7acd6ff 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -101,7 +101,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { // cRange is generated from template and contains the available parameters for the multiexp window size {{- if eq $.PointName "g1" }} cRange := []uint64{ - {{- range $c := $.CRange}}{{$c}},{{- end}} + {{- range $c := $.CRange}}{{- if gt $c 1}}{{$c}},{{- end}}{{- end}} } if testing.Short() { // test only "odd" and "even" (ie windows size divide word size vs not) @@ -139,6 +139,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } for i:=1; i < len(results);i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1],cRange[i]) return false } } @@ -178,6 +179,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) { } for i := 1; i < len(results); i++ { if !results[i].Equal(&results[i-1]) { + t.Logf("result for c=%d != c=%d", cRange[i-1],cRange[i]) return false } } From 3133efdc7178e9f8f898a4a268d82c65f39e7387 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 15:33:08 -0600 Subject: [PATCH 36/43] test: added cross msm tests --- ecc/bls12-377/multiexp_test.go | 210 ++++++++++++++++-- ecc/bls12-378/multiexp_test.go | 210 ++++++++++++++++-- ecc/bls12-381/multiexp_test.go | 210 ++++++++++++++++-- ecc/bls24-315/multiexp_test.go | 210 ++++++++++++++++-- ecc/bls24-317/multiexp_test.go | 210 ++++++++++++++++-- ecc/bn254/multiexp_test.go | 210 ++++++++++++++++-- ecc/bw6-633/multiexp_test.go | 210 ++++++++++++++++-- ecc/bw6-756/multiexp_test.go | 210 ++++++++++++++++-- 
ecc/bw6-761/multiexp_test.go | 210 ++++++++++++++++-- .../ecc/template/tests/multiexp.go.tmpl | 140 ++++++++++-- 10 files changed, 1885 insertions(+), 145 deletions(-) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index 1eb820de13..f2487e2edc 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 
+func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to 
allocate a lot of memory for the buckets. + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index ab0f36f2ec..55524da71e 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // 
the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
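The CI comment above is about the per-chunk bucket arrays: with signed digits there are 2^(c-1) buckets of extended-Jacobian points per processing goroutine. A rough estimate, assuming X, Y, ZZ, ZZZ coordinates over a 384-bit base field (numbers are illustrative, not curve-exact):

	// Rough per-chunk bucket footprint behind the comment above.
	func g2BucketBytes(c uint) uint {
		const fpBytes = 48             // assumed 384-bit base field element
		const e2Bytes = 2 * fpBytes    // E2 = (a0, a1) over fp
		const pointBytes = 4 * e2Bytes // X, Y, ZZ, ZZZ
		return (1 << (c - 1)) * pointBytes
	}

	// g2BucketBytes(16) is about 12 MiB per chunk goroutine versus about
	// 6 KiB for c = 5, which is why the G2 cross test limits itself to the
	// two representative sizes {5, 16} instead of the full cRange.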
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index aa5814cf6e..8d96b5c59e 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the 
last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
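For intuition on the digits slicing used above (digits[j*n:(j+1)*n] hands chunk j exactly one digit per scalar, chunk-major), here is a minimal sketch of the base-2^c decomposition that partitionScalars conceptually performs. The helper name decompose is hypothetical, and the sketch is simplified to unsigned digits; the real partitionScalars also uses signed digits and handles Montgomery form.

	package main

	import (
		"fmt"
		"math/big"
	)

	// decompose splits s into ceil(bits/c) base-2^c digits, least significant first.
	// Simplified sketch: the library's partitionScalars uses signed digits to
	// halve the bucket count, which this toy version does not do.
	func decompose(s *big.Int, c uint, bits int) []uint64 {
		nbChunks := (bits + int(c) - 1) / int(c)
		mask := new(big.Int).SetUint64((uint64(1) << c) - 1)
		digits := make([]uint64, nbChunks)
		t := new(big.Int).Set(s)
		for j := 0; j < nbChunks; j++ {
			var d big.Int
			d.And(t, mask) // low c bits = digit j
			digits[j] = d.Uint64()
			t.Rsh(t, c) // move to the next bit-window
		}
		return digits
	}

	func main() {
		s, _ := new(big.Int).SetString("123456789abcdef0", 16)
		fmt.Println(decompose(s, 16, 64)) // four 16-bit digits, little-endian
	}
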
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 9787f4f172..4e67c67761 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the 
last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
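The reference MSM above fans work out over one buffered channel per chunk and reduces the partial results in chunk order. The same pattern in miniature, with integer partial sums standing in for the extended-Jacobian chunk results; this is toy code under that analogy, not the library's API.

	package main

	import "fmt"

	func main() {
		data := []int{3, 1, 4, 1, 5, 9, 2, 6}
		const nbChunks = 4
		n := len(data) / nbChunks

		// one buffered channel per chunk, like chChunks[i]
		ch := make([]chan int, nbChunks)
		for i := range ch {
			ch[i] = make(chan int, 1)
		}

		for j := 0; j < nbChunks; j++ {
			go func(j int) {
				sum := 0
				for _, v := range data[j*n : (j+1)*n] {
					sum += v
				}
				ch[j] <- sum // send the partial result, as processChunk does
			}(j)
		}

		// reduce: receive in chunk order
		total := 0
		for j := 0; j < nbChunks; j++ {
			total += <-ch[j]
		}
		fmt.Println(total) // 31
	}
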
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 54fbbb031f..33e7c834c5 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the 
last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
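The final msmReduceChunk* step recombines the per-chunk results as T = sum_j 2^(c*j) * T_j (with c == 16 here). Over plain integers that is just the statement that base-2^16 digits rebuild the number; the following sketch checks it with a Horner-style recombination, mirroring the doubling between chunks.

	package main

	import (
		"fmt"
		"math/big"
	)

	func main() {
		const c = 16
		s := big.NewInt(0xCAFEBABE)

		// digits of s in base 2^c, least significant first
		var digits []int64
		t := new(big.Int).Set(s)
		for t.Sign() != 0 {
			digits = append(digits, t.Int64()&((1<<c)-1))
			t.Rsh(t, c)
		}

		// recombine from the most significant chunk down:
		// shift by c between chunks, then add the chunk result
		sum := new(big.Int)
		for j := len(digits) - 1; j >= 0; j-- {
			sum.Lsh(sum, c)
			sum.Add(sum, big.NewInt(digits[j]))
		}
		fmt.Println(sum.Cmp(s) == 0) // true
	}
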
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index a6cf0fbe50..3307840f6a 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // 
the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
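The rewritten properties assert results[i].Z.IsZero() instead of comparing consecutive results: in Jacobian coordinates the point at infinity is exactly the class with Z = 0, so the check reads "is infinity" directly. A toy illustration of that convention with a hypothetical struct (not the library's G1Jac):

	package main

	import (
		"fmt"
		"math/big"
	)

	// toyJac is a stand-in for a Jacobian point (X:Y:Z), with the usual
	// convention that Z == 0 encodes the point at infinity.
	type toyJac struct {
		X, Y, Z big.Int
	}

	func (p *toyJac) setInfinity() {
		p.X.SetInt64(1)
		p.Y.SetInt64(1)
		p.Z.SetInt64(0) // Z = 0 <=> infinity
	}

	func (p *toyJac) isInfinity() bool {
		return p.Z.Sign() == 0
	}

	func main() {
		var p toyJac
		p.setInfinity()
		fmt.Println(p.isInfinity()) // true
	}
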
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index 8e5c73a1e6..dc7ef60c2c 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{4, 5, 8, 12, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed 
with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
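The new "vector of 0s" property keeps the gopter shape used throughout these tests: a generator feeds prop.ForAll, and the callback returns false on the first violation. A reduced skeleton of that shape, with a placeholder property over integers standing in for the MSM check:

	package main

	import (
		"testing"

		"github.com/leanovate/gopter"
		"github.com/leanovate/gopter/gen"
		"github.com/leanovate/gopter/prop"
	)

	func TestZeroSum(t *testing.T) {
		parameters := gopter.DefaultTestParameters()
		parameters.MinSuccessfulTests = 10

		properties := gopter.NewProperties(parameters)

		// stand-in for "MSM with all-zero scalars is the identity":
		// a sum of n zeros is zero, whatever n is
		properties.Property("sum of 0s is 0", prop.ForAll(
			func(n int) bool {
				sum := 0
				for i := 0; i < n; i++ {
					sum += 0
				}
				return sum == 0
			},
			gen.IntRange(1, 1000),
		))

		properties.TestingRun(t, gopter.ConsoleReporter(false))
	}
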
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 3daeb64fca..308efca4c9 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{3, 4, 5, 8, 11, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be processed 
with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
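Trimming cRange under testing.Short() is the usual way to keep `go test -short` cheap while still covering one window size that divides the 64-bit word and one that does not. A minimal standalone version of the pattern:

	package main

	import (
		"fmt"
		"testing"
	)

	func TestWindowSizes(t *testing.T) {
		cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
		if testing.Short() {
			// keep one window that divides the word size (16) and one that doesn't (5)
			cRange = []uint64{5, 16}
		}
		for _, c := range cRange {
			fmt.Println("would run msm with c =", c)
		}
	}
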
+ // test only "odd" and "even" (ie windows size divide word size vs not) + cRange := []uint64{5, 16} + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G2Jac + _innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G2Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG2Reference always do ext jacobian with c == 16 +func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g2JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g2JacExtended, 1) + } + + // the last chunk may be processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG2Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG2(b *testing.B) { const ( diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 8520cc7435..98560005bc 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -144,7 +144,6 @@ func TestMultiExpG1(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G1Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G1Jac @@ -160,18 +159,36 @@ func TestMultiExpG1(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). 
FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G1Jac, len(cRange)) for i, c := range cRange { _innerMsmG1(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G1] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -218,6 +235,81 @@ func TestMultiExpG1(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG1(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G1Affine + var g G1Jac + g.Set(&g1Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g1Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + cRange := []uint64{2, 3, 4, 5, 8, 10, 16} + if testing.Short() { + // test only "odd" and "even" (ie windows size divide word size vs not) + cRange = []uint64{5, 16} + } + + results := make([]G1Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG1(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + + var r G1Jac + _innerMsmG1Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + + var expected, got G1Affine + expected.FromJacobian(&r) + + for i := 0; i < len(results); i++ { + got.FromJacobian(&results[i]) + if !expected.Equal(&got) { + t.Fatalf("cross msm failed with c=%d", cRange[i]) + } + } + +} + +// _innerMsmG1Reference always do ext jacobian with c == 16 +func _innerMsmG1Reference(p *G1Jac, points []G1Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G1Jac { + // partition the scalars + digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks) + + nbChunks := computeNbChunks(16) + + // for each chunk, spawn one go routine that'll loop through all the scalars in the + // corresponding bit-window + // note that buckets is an array allocated on the stack and this is critical for performance + + // each go routine sends its result in chChunks[i] channel + chChunks := make([]chan g1JacExtended, nbChunks) + for i := 0; i < len(chChunks); i++ { + chChunks[i] = make(chan g1JacExtended, 1) + } + + // the last chunk may be 
processed with a different method than the rest, as it could be smaller. + n := len(points) + for j := int(nbChunks - 1); j >= 0; j-- { + processChunk := processChunkG1Jacobian[bucketg1JacExtendedC16] + go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n]) + } + + return msmReduceChunkG1Affine(p, int(16), chChunks[:]) +} + func BenchmarkMultiExpG1(b *testing.B) { const ( @@ -464,7 +556,6 @@ func TestMultiExpG2(t *testing.T) { func(mixer fr.Element) bool { var samplePointsZero [nbSamples]G2Affine - copy(samplePointsZero[:], samplePoints[:]) var expected G2Jac @@ -480,18 +571,36 @@ func TestMultiExpG2(t *testing.T) { sampleScalars[i-1].SetUint64(uint64(i)). Mul(&sampleScalars[i-1], &mixer). FromMont() - if i%10 == 0 { - samplePointsZero[i].setInfinity() - } + samplePointsZero[i-1].setInfinity() } results := make([]G2Jac, len(cRange)) for i, c := range cRange { _innerMsmG2(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) } - for i := 1; i < len(results); i++ { - if !results[i].Equal(&results[i-1]) { - t.Logf("result for c=%d != c=%d", cRange[i-1], cRange[i]) + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) + return false + } + } + return true + }, + genScalar, + )) + + properties.Property(fmt.Sprintf("[G2] Multi exponentation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll( + func(mixer fr.Element) bool { + // mixer ensures that all the words of a fpElement are set + var sampleScalars [nbSamples]fr.Element + + results := make([]G2Jac, len(cRange)) + for i, c := range cRange { + _innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()}) + } + for i := 0; i < len(results); i++ { + if !results[i].Z.IsZero() { + t.Logf("result for c=%d is not infinity", cRange[i]) return false } } @@ -538,6 +647,79 @@ func TestMultiExpG2(t *testing.T) { properties.TestingRun(t, gopter.ConsoleReporter(false)) } +func TestCrossMultiExpG2(t *testing.T) { + const nbSamples = 1 << 14 + // multi exp points + var samplePoints [nbSamples]G2Affine + var g G2Jac + g.Set(&g2Gen) + for i := 1; i <= nbSamples; i++ { + samplePoints[i-1].FromJacobian(&g) + g.AddAssign(&g2Gen) + } + + // sprinkle some points at infinity + rand.Seed(time.Now().UnixNano()) + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + samplePoints[rand.Intn(nbSamples)].setInfinity() + + var sampleScalars [nbSamples]fr.Element + fillBenchScalars(sampleScalars[:]) + + // cRange is generated from template and contains the available parameters for the multiexp window size + // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
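The tests seed math/rand from the wall clock before picking the indices forced to infinity. Worth noting: rand.Seed is deprecated since Go 1.20, and a local rand.New(rand.NewSource(seed)) with a fixed seed (42 below is arbitrary) makes a failing run replayable; a sketch of that alternative:

	package main

	import (
		"fmt"
		"math/rand"
	)

	func main() {
		const nbSamples = 1 << 14

		// fixed seed: a failure can be replayed with the same indices
		rng := rand.New(rand.NewSource(42))

		infinityIdx := make([]int, 4)
		for i := range infinityIdx {
			infinityIdx[i] = rng.Intn(nbSamples) // index of a point forced to infinity
		}
		fmt.Println(infinityIdx)
	}
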
+	// test only "odd" and "even" (i.e. window sizes that divide the word size vs. those that don't)
+	cRange := []uint64{5, 16}
+
+	results := make([]G2Jac, len(cRange))
+	for i, c := range cRange {
+		_innerMsmG2(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+	}
+
+	var r G2Jac
+	_innerMsmG2Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+
+	var expected, got G2Affine
+	expected.FromJacobian(&r)
+
+	for i := 0; i < len(results); i++ {
+		got.FromJacobian(&results[i])
+		if !expected.Equal(&got) {
+			t.Fatalf("cross msm failed with c=%d", cRange[i])
+		}
+	}
+
+}
+
+// _innerMsmG2Reference always does ext. Jacobian with c == 16
+func _innerMsmG2Reference(p *G2Jac, points []G2Affine, scalars []fr.Element, config ecc.MultiExpConfig) *G2Jac {
+	// partition the scalars
+	digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks)
+
+	nbChunks := computeNbChunks(16)
+
+	// for each chunk, spawn one goroutine that'll loop through all the scalars in the
+	// corresponding bit-window
+	// note that buckets is an array allocated on the stack and this is critical for performance
+
+	// each goroutine sends its result in chChunks[i] channel
+	chChunks := make([]chan g2JacExtended, nbChunks)
+	for i := 0; i < len(chChunks); i++ {
+		chChunks[i] = make(chan g2JacExtended, 1)
+	}
+
+	// the last chunk may be processed with a different method than the rest, as it could be smaller.
+	n := len(points)
+	for j := int(nbChunks - 1); j >= 0; j-- {
+		processChunk := processChunkG2Jacobian[bucketg2JacExtendedC16]
+		go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n])
+	}
+
+	return msmReduceChunkG2Affine(p, int(16), chChunks[:])
+}
+
 func BenchmarkMultiExpG2(b *testing.B) {

 	const (
diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl
index 08c7acd6ff..d99a83a49f 100644
--- a/internal/generator/ecc/template/tests/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl
@@ -29,7 +29,7 @@ import (

 {{define "multiexp" }}

-func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
+func TestMultiExp{{$.UPointName}}(t *testing.T) {

 	parameters := gopter.DefaultTestParameters()
 	if testing.Short() {
@@ -72,7 +72,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {

 	// ensure a multiexp that's splitted has the same result as a non-splitted one..
diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl
index 08c7acd6ff..d99a83a49f 100644
--- a/internal/generator/ecc/template/tests/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl
@@ -29,7 +29,7 @@ import (

 {{define "multiexp" }}
-func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
+func TestMultiExp{{$.UPointName}}(t *testing.T) {

 parameters := gopter.DefaultTestParameters()
 if testing.Short() {
@@ -72,7 +72,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {

 // ensure a multiexp that's splitted has the same result as a non-splitted one..
- properties.Property("[{{ toUpper $.PointName }}] Multi exponentation (c=16) should be consistent with splitted multiexp", prop.ForAll(
+ properties.Property("[{{ $.UPointName }}] Multi exponentiation (c=16) should be consistent with split multiexp", prop.ForAll(
 func(mixer fr.Element) bool {
 var samplePointsLarge [nbSamples*13]{{ $.TAffine }}
 for i:=0; i<13; i++ {
@@ -113,7 +113,7 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 cRange := []uint64{5, 16}
 {{- end}}

- properties.Property(fmt.Sprintf("[{{ toUpper $.PointName }}] Multi exponentation (c in %v) should be consistent with sum of square", cRange), prop.ForAll(
+ properties.Property(fmt.Sprintf("[{{ $.UPointName }}] Multi exponentiation (c in %v) should be consistent with sum of square", cRange), prop.ForAll(
 func(mixer fr.Element) bool {

 var expected {{ $.TJacobian }}

@@ -148,11 +148,10 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 genScalar,
 ))

- properties.Property(fmt.Sprintf("[{{ toUpper $.PointName }}] Multi exponentation (c in %v) of points at infinity should output a point at infinity", cRange), prop.ForAll(
+ properties.Property(fmt.Sprintf("[{{ $.UPointName }}] Multi exponentiation (c in %v) of points at infinity should output a point at infinity", cRange), prop.ForAll(
 func(mixer fr.Element) bool {

 var samplePointsZero [nbSamples]{{ $.TAffine }}
- copy(samplePointsZero[:], samplePoints[:])

 var expected {{ $.TJacobian }}

@@ -168,18 +167,37 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 sampleScalars[i-1].SetUint64(uint64(i)).
 Mul(&sampleScalars[i-1], &mixer).
 FromMont()
- if i % 10 == 0 {
- samplePointsZero[i].setInfinity()
- }
+ samplePointsZero[i-1].setInfinity()
 }

 results := make([]{{ $.TJacobian }}, len(cRange))
 for i, c := range cRange {
 _innerMsm{{ $.UPointName }}(&results[i], c, samplePointsZero[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
 }
- for i := 1; i < len(results); i++ {
- if !results[i].Equal(&results[i-1]) {
- t.Logf("result for c=%d != c=%d", cRange[i-1],cRange[i])
+ for i := 0; i < len(results); i++ {
+ if !results[i].Z.IsZero() {
+ t.Logf("result for c=%d is not infinity", cRange[i])
+ return false
+ }
+ }
+ return true
+ },
+ genScalar,
+ ))
+
+ properties.Property(fmt.Sprintf("[{{ $.UPointName }}] Multi exponentiation (c in %v) with a vector of 0s as input should output a point at infinity", cRange), prop.ForAll(
+ func(mixer fr.Element) bool {
+ // mixer ensures that all the words of an fpElement are set
+ var sampleScalars [nbSamples]fr.Element
+
+
+ results := make([]{{ $.TJacobian }}, len(cRange))
+ for i, c := range cRange {
+ _innerMsm{{ $.UPointName }}(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+ }
+ for i := 0; i < len(results); i++ {
+ if !results[i].Z.IsZero() {
+ t.Logf("result for c=%d is not infinity", cRange[i])
 return false
 }
 }
@@ -191,7 +209,7 @@

 // note : this test is here as we expect to have a different multiExp than the above bucket method
 // for small number of points
- properties.Property("[{{ toUpper $.PointName }}] Multi exponentation (<50points) should be consistent with sum of square", prop.ForAll(
+ properties.Property("[{{ $.UPointName }}] Multi exponentiation (<50 points) should be consistent with sum of square", prop.ForAll(
 func(mixer fr.Element) bool {

 var g {{ $.TJacobian }}

@@ -228,10 +246,94 @@ func TestMultiExp{{toUpper $.PointName}}(t *testing.T) {
 properties.TestingRun(t,
 gopter.ConsoleReporter(false))
 }
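The cRange values picked by the template drive the size of the bucket arrays each chunk processor allocates: with the signed-digit recoding produced by partitionScalars, a window of width c needs 2^(c-1) buckets. A small illustration of that arithmetic (nbBucketsSketch is a hypothetical helper; the generator introduces an equivalent nbBuckets template function later in this series):

    // nbBucketsSketch returns the bucket count for a window of width c bits.
    // Signed digits take values in roughly [-2^(c-1), 2^(c-1)]; negative
    // digits reuse the same buckets via subtraction, and digit 0 needs no
    // bucket, so 2^(c-1) buckets suffice per window.
    func nbBucketsSketch(c uint64) int {
        return 1 << (c - 1)
    }

For example, nbBucketsSketch(5) == 16 and nbBucketsSketch(16) == 32768, matching the bucketg1JacExtendedC5 and bucketg1JacExtendedC16 fixed-size array types declared in the generated files.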
+
+func TestCrossMultiExp{{ $.UPointName }}(t *testing.T) {
+ const nbSamples = 1 << 14
+ // multi exp points
+ var samplePoints [nbSamples]{{ $.TAffine }}
+ var g {{ $.TJacobian }}
+ g.Set(&{{ toLower $.PointName }}Gen)
+ for i := 1; i <= nbSamples; i++ {
+ samplePoints[i-1].FromJacobian(&g)
+ g.AddAssign(&{{ toLower $.PointName }}Gen)
+ }
+
+ // sprinkle some points at infinity
+ rand.Seed(time.Now().UnixNano())
+ samplePoints[rand.Intn(nbSamples)].setInfinity()
+ samplePoints[rand.Intn(nbSamples)].setInfinity()
+ samplePoints[rand.Intn(nbSamples)].setInfinity()
+ samplePoints[rand.Intn(nbSamples)].setInfinity()
+
+ var sampleScalars [nbSamples]fr.Element
+ fillBenchScalars(sampleScalars[:])
+
+ // cRange is generated from template and contains the available parameters for the multiexp window size
+ {{- if eq $.PointName "g1" }}
+ cRange := []uint64{
+ {{- range $c := $.CRange}}{{- if gt $c 1}}{{$c}},{{- end}}{{- end}}
+ }
+ if testing.Short() {
+ // test only "odd" and "even" (i.e. window sizes that divide the word size vs. those that do not)
+ cRange = []uint64{5, 16}
+ }
+ {{- else }}
+ // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets.
+ // test only "odd" and "even" (i.e. window sizes that divide the word size vs. those that do not)
+ cRange := []uint64{5, 16}
+ {{- end}}
+
+ results := make([]{{ $.TJacobian }}, len(cRange))
+ for i, c := range cRange {
+ _innerMsm{{ $.UPointName }}(&results[i], c, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+ }
+
+ var r {{ $.TJacobian }}
+ _innerMsm{{ $.UPointName }}Reference(&r, samplePoints[:], sampleScalars[:], ecc.MultiExpConfig{NbTasks: runtime.NumCPU()})
+
+ var expected, got {{ $.TAffine}}
+ expected.FromJacobian(&r)
+
+ for i:=0; i<len(results); i++ {
+ got.FromJacobian(&results[i])
+ if !expected.Equal(&got) {
+ t.Fatalf("cross msm failed with c=%d", cRange[i])
+ }
+ }
+
+}
+
+// _innerMsm{{ $.UPointName }}Reference always uses ext jacobian with c == 16
+func _innerMsm{{ $.UPointName }}Reference(p *{{ $.TJacobian }}, points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) *{{ $.TJacobian }} {
+ // partition the scalars
+ digits, _ := partitionScalars(scalars, 16, config.ScalarsMont, config.NbTasks)
+
+ nbChunks := computeNbChunks(16)
+
+ // for each chunk, spawn one go routine that'll loop through all the scalars in the
+ // corresponding bit-window
+ // note that buckets is an array allocated on the stack and this is critical for performance
+
+ // each go routine sends its result in chChunks[i] channel
+ chChunks := make([]chan {{ $.TJacobianExtended }}, nbChunks)
+ for i := 0; i < len(chChunks); i++ {
+ chChunks[i] = make(chan {{ $.TJacobianExtended }}, 1)
+ }
+
+ // the last chunk may be processed with a different method than the rest, as it could be smaller.
+ n := len(points)
+ for j := int(nbChunks - 1); j >= 0; j-- {
+ processChunk := processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C16]
+ go processChunk(uint64(j), chChunks[j], 16, points, digits[j*n:(j+1)*n])
+ }
+
+ return msmReduceChunk{{ $.TAffine }}(p, int(16), chChunks[:])
+}

-func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) {
+func BenchmarkMultiExp{{ $.UPointName }}(b *testing.B) {

 const (
 pow = (bits.UintSize / 2) - (bits.UintSize / 8) // 24 on 64 bits arch, 12 on 32 bits
@@ -266,7 +368,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) {
 }
 }

- fillBenchBases{{ toUpper $.PointName }}(samplePoints[:])
+ fillBenchBases{{ $.UPointName }}(samplePoints[:])

 var testPoint {{ $.TAffine }}

@@ -298,7 +400,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}(b *testing.B) {
 }

-func BenchmarkMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) {
+func BenchmarkMultiExp{{ $.UPointName }}Reference(b *testing.B) {
 const nbSamples = 1 << 20

 var (
@@ -307,7 +409,7 @@ func BenchmarkMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) {
 )

 fillBenchScalars(sampleScalars[:])
- fillBenchBases{{ toUpper $.PointName }}(samplePoints[:])
+ fillBenchBases{{ $.UPointName }}(samplePoints[:])

 var testPoint {{ $.TAffine }}

@@ -318,7 +420,7 @@
 }

-func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) {
+func BenchmarkManyMultiExp{{ $.UPointName }}Reference(b *testing.B) {
 const nbSamples = 1 << 20

 var (
@@ -327,7 +429,7 @@ func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) {
 )

 fillBenchScalars(sampleScalars[:])
- fillBenchBases{{ toUpper $.PointName }}(samplePoints[:])
+ fillBenchBases{{ $.UPointName }}(samplePoints[:])

 var t1, t2, t3 {{ $.TAffine
}} @@ -357,7 +459,7 @@ func BenchmarkManyMultiExp{{ toUpper $.PointName }}Reference(b *testing.B) { // Rationale for generating points that are not on the curve is that for large benchmarks, generating // a vector of different points can take minutes. Using the same point or subset will bias the benchmark result // since bucket additions in extended jacobian coordinates will hit doubling algorithm instead of add. -func fillBenchBases{{ toUpper $.PointName }}(samplePoints []{{ $.TAffine }}) { +func fillBenchBases{{ $.UPointName }}(samplePoints []{{ $.TAffine }}) { var r big.Int r.SetString("340444420969191673093399857471996460938405", 10) samplePoints[0].ScalarMultiplication(&samplePoints[0], &r) From 37ae24ef7e3861ef04c33d31361ac9dfecf2daad Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 15:39:25 -0600 Subject: [PATCH 37/43] style: make staticcheck happier by code generating bucket sizes in clear --- ecc/bls12-377/multiexp_affine.go | 58 +++++++++--------- ecc/bls12-377/multiexp_jacobian.go | 60 +++++++++---------- ecc/bls12-378/multiexp_affine.go | 58 +++++++++--------- ecc/bls12-378/multiexp_jacobian.go | 60 +++++++++---------- ecc/bls12-381/multiexp_affine.go | 58 +++++++++--------- ecc/bls12-381/multiexp_jacobian.go | 60 +++++++++---------- ecc/bls24-315/multiexp_affine.go | 58 +++++++++--------- ecc/bls24-315/multiexp_jacobian.go | 60 +++++++++---------- ecc/bls24-317/multiexp_affine.go | 58 +++++++++--------- ecc/bls24-317/multiexp_jacobian.go | 60 +++++++++---------- ecc/bn254/multiexp_affine.go | 58 +++++++++--------- ecc/bn254/multiexp_jacobian.go | 60 +++++++++---------- ecc/bw6-633/multiexp_affine.go | 20 +++---- ecc/bw6-633/multiexp_jacobian.go | 24 ++++---- ecc/bw6-756/multiexp_affine.go | 20 +++---- ecc/bw6-756/multiexp_jacobian.go | 24 ++++---- ecc/bw6-761/multiexp_affine.go | 22 +++---- ecc/bw6-761/multiexp_jacobian.go | 28 ++++----- internal/generator/ecc/generate.go | 4 ++ .../ecc/template/multiexp_affine.go.tmpl | 4 +- .../ecc/template/multiexp_jacobian.go.tmpl | 2 +- 21 files changed, 430 insertions(+), 426 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 83ce91c32a..331f283ede 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type 
bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC2 [2]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index 8fd4e382ff..f34d5ff332 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 
[32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index d8b54b76ca..0f65e1838d 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 
[512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC2 [2]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC2 | diff --git a/ecc/bls12-378/multiexp_jacobian.go b/ecc/bls12-378/multiexp_jacobian.go index eb83e3c1c2..0e9d572e73 100644 --- a/ecc/bls12-378/multiexp_jacobian.go +++ b/ecc/bls12-378/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC2 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // 
this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC2 | diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index bfc282b553..c566026823 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type 
bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index bc304041f6..55cdd766b5 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended 
-type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 4e679fea95..4bf2f7f50c 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type 
qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC2 [2]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 23310862df..0cd3432dad 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 
1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index f657bf2bcf..5cc17e3cc8 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type 
bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index 973219cc4b..6ecfd659e9 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type 
bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index 1f132b885e..d38581dbb6 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -235,13 +235,13 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC13 [1 << (13 - 1)]G1Affine -type bucketG1AffineC14 [1 << (14 - 1)]G1Affine -type bucketG1AffineC15 [1 << (15 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC13 [4096]G1Affine +type bucketG1AffineC14 [8192]G1Affine +type bucketG1AffineC15 [16384]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -554,13 +554,13 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC13 [1 << (13 - 1)]G2Affine -type bucketG2AffineC14 [1 << (14 - 1)]G2Affine -type bucketG2AffineC15 [1 << (15 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC13 [4096]G2Affine +type bucketG2AffineC14 [8192]G2Affine +type bucketG2AffineC15 [16384]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -659,21 +659,21 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC6 [1 << (6 - 1)]bool -type bitSetC7 [1 << (7 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC9 [1 << (9 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type 
bitSetC13 [1 << (13 - 1)]bool -type bitSetC14 [1 << (14 - 1)]bool -type bitSetC15 [1 << (15 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC2 [2]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC6 [32]bool +type bitSetC7 [64]bool +type bitSetC8 [128]bool +type bitSetC9 [256]bool +type bitSetC10 [512]bool +type bitSetC11 [1024]bool +type bitSetC12 [2048]bool +type bitSetC13 [4096]bool +type bitSetC14 [8192]bool +type bitSetC15 [16384]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC2 | diff --git a/ecc/bn254/multiexp_jacobian.go b/ecc/bn254/multiexp_jacobian.go index a674d4f724..0bd2482a98 100644 --- a/ecc/bn254/multiexp_jacobian.go +++ b/ecc/bn254/multiexp_jacobian.go @@ -61,21 +61,21 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC6 [1 << (6 - 1)]g1JacExtended -type bucketg1JacExtendedC7 [1 << (7 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC9 [1 << (9 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC13 [1 << (13 - 1)]g1JacExtended -type bucketg1JacExtendedC14 [1 << (14 - 1)]g1JacExtended -type bucketg1JacExtendedC15 [1 << (15 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC6 [32]g1JacExtended +type bucketg1JacExtendedC7 [64]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC9 [256]g1JacExtended +type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC13 [4096]g1JacExtended +type bucketg1JacExtendedC14 [8192]g1JacExtended +type bucketg1JacExtendedC15 [16384]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC2 | @@ -140,21 +140,21 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC6 [1 << (6 - 1)]g2JacExtended -type bucketg2JacExtendedC7 [1 << (7 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC9 [1 << (9 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC13 [1 << (13 - 1)]g2JacExtended -type bucketg2JacExtendedC14 [1 << (14 - 1)]g2JacExtended -type bucketg2JacExtendedC15 [1 << (15 - 1)]g2JacExtended -type 
bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC6 [32]g2JacExtended +type bucketg2JacExtendedC7 [64]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC9 [256]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC13 [4096]g2JacExtended +type bucketg2JacExtendedC14 [8192]g2JacExtended +type bucketg2JacExtendedC15 [16384]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC2 | diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index 949a53f642..d05f2ce04f 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -234,8 +234,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC12 [1 << (12 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC12 [2048]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -493,8 +493,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC12 [1 << (12 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC12 [2048]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -538,12 +538,12 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1 << (1 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC12 [1 << (12 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC1 [1]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC8 [128]bool +type bitSetC12 [2048]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC1 | diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index 497f2697fb..eb4a8a2a02 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -61,12 +61,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1 << (1 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC12 [1 << (12 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC1 [1]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC12 [2048]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC1 | @@ -122,12 +122,12 @@ func 
processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1 << (1 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC12 [1 << (12 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC1 [1]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC12 [2048]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC1 | diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 83cd6d1d61..968db46e6f 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -234,8 +234,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC11 [1 << (11 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC11 [1024]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -493,8 +493,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC11 [1 << (11 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC11 [1024]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -538,12 +538,12 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC11 [1 << (11 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC8 [128]bool +type bitSetC11 [1024]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC3 | diff --git a/ecc/bw6-756/multiexp_jacobian.go b/ecc/bw6-756/multiexp_jacobian.go index 93fd87fe51..cd15044132 100644 --- a/ecc/bw6-756/multiexp_jacobian.go +++ b/ecc/bw6-756/multiexp_jacobian.go @@ -61,12 +61,12 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC11 [1 << (11 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended +type bucketg1JacExtendedC11 [1024]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC3 | @@ -122,12 
+122,12 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC11 [1 << (11 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC11 [1024]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC3 | diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index bfeea763cb..91750cd328 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -234,8 +234,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG1AffineC10 [1 << (10 - 1)]G1Affine -type bucketG1AffineC16 [1 << (16 - 1)]G1Affine +type bucketG1AffineC10 [512]G1Affine +type bucketG1AffineC16 [32768]G1Affine // buckets: array of G1Affine points of size 1 << (c-1) type ibG1Affine interface { @@ -493,8 +493,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketG2AffineC10 [1 << (10 - 1)]G2Affine -type bucketG2AffineC16 [1 << (16 - 1)]G2Affine +type bucketG2AffineC10 [512]G2Affine +type bucketG2AffineC16 [32768]G2Affine // buckets: array of G2Affine points of size 1 << (c-1) type ibG2Affine interface { @@ -538,13 +538,13 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC2 [1 << (2 - 1)]bool -type bitSetC3 [1 << (3 - 1)]bool -type bitSetC4 [1 << (4 - 1)]bool -type bitSetC5 [1 << (5 - 1)]bool -type bitSetC8 [1 << (8 - 1)]bool -type bitSetC10 [1 << (10 - 1)]bool -type bitSetC16 [1 << (16 - 1)]bool +type bitSetC2 [2]bool +type bitSetC3 [4]bool +type bitSetC4 [8]bool +type bitSetC5 [16]bool +type bitSetC8 [128]bool +type bitSetC10 [512]bool +type bitSetC16 [32768]bool type bitSet interface { bitSetC2 | diff --git a/ecc/bw6-761/multiexp_jacobian.go b/ecc/bw6-761/multiexp_jacobian.go index 59edd2d1bd..ca6b1610da 100644 --- a/ecc/bw6-761/multiexp_jacobian.go +++ b/ecc/bw6-761/multiexp_jacobian.go @@ -61,13 +61,13 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC2 [1 << (2 - 1)]g1JacExtended -type bucketg1JacExtendedC3 [1 << (3 - 1)]g1JacExtended -type bucketg1JacExtendedC4 [1 << (4 - 1)]g1JacExtended -type bucketg1JacExtendedC5 [1 << (5 - 1)]g1JacExtended -type bucketg1JacExtendedC8 [1 << (8 - 1)]g1JacExtended -type bucketg1JacExtendedC10 [1 << (10 - 1)]g1JacExtended -type bucketg1JacExtendedC16 [1 << (16 - 1)]g1JacExtended +type bucketg1JacExtendedC2 [2]g1JacExtended +type bucketg1JacExtendedC3 [4]g1JacExtended +type bucketg1JacExtendedC4 [8]g1JacExtended +type bucketg1JacExtendedC5 [16]g1JacExtended +type bucketg1JacExtendedC8 [128]g1JacExtended 
+type bucketg1JacExtendedC10 [512]g1JacExtended +type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { bucketg1JacExtendedC2 | @@ -124,13 +124,13 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC2 [1 << (2 - 1)]g2JacExtended -type bucketg2JacExtendedC3 [1 << (3 - 1)]g2JacExtended -type bucketg2JacExtendedC4 [1 << (4 - 1)]g2JacExtended -type bucketg2JacExtendedC5 [1 << (5 - 1)]g2JacExtended -type bucketg2JacExtendedC8 [1 << (8 - 1)]g2JacExtended -type bucketg2JacExtendedC10 [1 << (10 - 1)]g2JacExtended -type bucketg2JacExtendedC16 [1 << (16 - 1)]g2JacExtended +type bucketg2JacExtendedC2 [2]g2JacExtended +type bucketg2JacExtendedC3 [4]g2JacExtended +type bucketg2JacExtendedC4 [8]g2JacExtended +type bucketg2JacExtendedC5 [16]g2JacExtended +type bucketg2JacExtendedC8 [128]g2JacExtended +type bucketg2JacExtendedC10 [512]g2JacExtended +type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { bucketg2JacExtendedC2 | diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 9af8b0dd15..2dce39800b 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -69,6 +69,10 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er funcs["lastC"] = lastC funcs["batchSize"] = batchSize + funcs["nbBuckets"] = func(c int) int { + return 1 << (c - 1) + } + funcs["contains"] = func(v int, s []int) bool { for _, sv := range s { if v == sv { diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index a3c609910f..23baf3d16d 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -239,7 +239,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B // this allow us to allocate the buckets on the stack {{- range $c := $.CRange}} {{- if gt $c 9}} -type bucket{{ $.TAffine }}C{{$c}} [1<<({{$c}}-1)]{{ $.TAffine }} +type bucket{{ $.TAffine }}C{{$c}} [{{nbBuckets $c}}]{{ $.TAffine }} {{- end}} {{- end}} @@ -304,7 +304,7 @@ type q{{ $.TAffine }}C{{$c}} [{{batchSize $c}}]batchOp{{ $.TAffine }} {{end }} {{- range $c := $.G1.CRange}} -type bitSetC{{$c}} [1<<({{$c}}-1)]bool +type bitSetC{{$c}} [{{nbBuckets $c}}]bool {{- end}} type bitSet interface { diff --git a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl index 3fd44311bc..166d185faa 100644 --- a/internal/generator/ecc/template/multiexp_jacobian.go.tmpl +++ b/internal/generator/ecc/template/multiexp_jacobian.go.tmpl @@ -64,7 +64,7 @@ func processChunk{{ $.UPointName }}Jacobian[B ib{{ $.TJacobianExtended }}](chunk // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack {{- range $c := $.CRange}} -type bucket{{ $.TJacobianExtended }}C{{$c}} [1<<({{$c}}-1)]{{ $.TJacobianExtended }} +type bucket{{ $.TJacobianExtended }}C{{$c}} [{{nbBuckets $c}}]{{ $.TJacobianExtended }} {{- end}} type ib{{ $.TJacobianExtended }} interface { From 0eb6955ebd0acb5c1ee4a8c1f147918247d4a36d Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 21:23:13 -0600 Subject: [PATCH 38/43] feat: deal with doubling edge case using other set of buckets --- ecc/bls12-377/multiexp_affine.go | 22 +++++++------------ 
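The literal array lengths above are the evaluated form of 1 << (c - 1): a window of c bits needs only 2^(c-1) buckets, because the signed-digit decomposition in partitionScalars keeps each digit's absolute value at most 2^(c-1), and a digit -d reuses the bucket of d by subtracting the point instead of adding it. A minimal sketch of the nbBuckets helper registered in generate.go, checked against two of the generated sizes (the main function is illustrative only):

package main

import "fmt"

// nbBuckets returns the bucket count for a window of c bits.
// Digits d and -d share bucket d-1, so 2^(c-1) buckets suffice
// instead of the 2^c - 1 an unsigned decomposition would need.
func nbBuckets(c int) int {
	return 1 << (c - 1)
}

func main() {
	fmt.Println(nbBuckets(10)) // 512, as in bucketG1AffineC10
	fmt.Println(nbBuckets(16)) // 32768, as in bucketG1AffineC16
}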
ecc/bls12-378/multiexp_affine.go | 22 +++++++------------ ecc/bls12-381/multiexp_affine.go | 22 +++++++------------ ecc/bls24-315/multiexp_affine.go | 22 +++++++------------ ecc/bls24-317/multiexp_affine.go | 22 +++++++------------ ecc/bn254/multiexp_affine.go | 22 +++++++------------ ecc/bw6-633/multiexp_affine.go | 22 +++++++------------ ecc/bw6-756/multiexp_affine.go | 22 +++++++------------ ecc/bw6-761/multiexp_affine.go | 22 +++++++------------ .../ecc/template/multiexp_affine.go.tmpl | 11 ++++------ 10 files changed, 76 insertions(+), 133 deletions(-) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index 331f283ede..df95b34370 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bls12-378/multiexp_affine.go b/ecc/bls12-378/multiexp_affine.go index 0f65e1838d..28301102cf 100644 --- a/ecc/bls12-378/multiexp_affine.go +++ b/ecc/bls12-378/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index c566026823..66f1e361a1 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 4bf2f7f50c..5db6512603 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 5cc17e3cc8..37050a251b 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. 
- // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bn254/multiexp_affine.go b/ecc/bn254/multiexp_affine.go index d38581dbb6..c0fd33431e 100644 --- a/ecc/bn254/multiexp_affine.go +++ b/ecc/bn254/multiexp_affine.go @@ -97,9 +97,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -127,10 +126,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -139,7 +136,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -416,9 +413,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -446,10 +442,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -458,7 +452,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index d05f2ce04f..f7189e2c35 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -96,9 +96,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -126,10 +125,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -138,7 +135,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -355,9 +352,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -385,10 +381,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -397,7 +391,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bw6-756/multiexp_affine.go b/ecc/bw6-756/multiexp_affine.go index 968db46e6f..086c2e9f83 100644 --- a/ecc/bw6-756/multiexp_affine.go +++ b/ecc/bw6-756/multiexp_affine.go @@ -96,9 +96,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -126,10 +125,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -138,7 +135,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -355,9 +352,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -385,10 +381,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -397,7 +391,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/ecc/bw6-761/multiexp_affine.go b/ecc/bw6-761/multiexp_affine.go index 91750cd328..5f423838c4 100644 --- a/ecc/bw6-761/multiexp_affine.go +++ b/ecc/bw6-761/multiexp_affine.go @@ -96,9 +96,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -126,10 +125,8 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -138,7 +135,7 @@ func processChunkG1BatchAffine[BJE ibg1JacExtended, B ibG1Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } @@ -355,9 +352,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
- BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -385,10 +381,8 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -397,7 +391,7 @@ func processChunkG2BatchAffine[BJE ibg2JacExtended, B ibG2Affine, BS bitSet, TP if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } diff --git a/internal/generator/ecc/template/multiexp_affine.go.tmpl b/internal/generator/ecc/template/multiexp_affine.go.tmpl index 23baf3d16d..979d05c00a 100644 --- a/internal/generator/ecc/template/multiexp_affine.go.tmpl +++ b/internal/generator/ecc/template/multiexp_affine.go.tmpl @@ -96,9 +96,8 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B if BK.X.Equal(&op.point.X) { if BK.Y.Equal(&op.point.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? - BK.Add(BK, BK) + // we use the other set of buckets + bucketsJE[op.bucketID].addMixed(&op.point) return } BK.setInfinity() @@ -126,10 +125,8 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B if BK.X.Equal(&PP.X) { if BK.Y.Equal(&PP.Y) { // P + P: doubling, which should be quite rare -- - // TODO FIXME @gbotrel / @yelhousni this path is not taken by our tests. - // need doubling in affine implemented ? 
if isAdd { - BK.Add(BK, BK) + bucketsJE[bucketID].addMixed(PP) } else { BK.setInfinity() } @@ -138,7 +135,7 @@ func processChunk{{ $.UPointName }}BatchAffine[BJE ib{{ $.TJacobianExtended }},B if isAdd { BK.setInfinity() } else { - BK.Add(BK, BK) + bucketsJE[bucketID].subMixed(PP) } return } From c1ec769268b0cfd298e4280ebe1b961224b75066 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 16 Nov 2022 21:29:02 -0600 Subject: [PATCH 39/43] test: add some doublings in msm test --- ecc/bls12-377/multiexp_test.go | 12 ++++++++++++ ecc/bls12-378/multiexp_test.go | 12 ++++++++++++ ecc/bls12-381/multiexp_test.go | 12 ++++++++++++ ecc/bls24-315/multiexp_test.go | 12 ++++++++++++ ecc/bls24-317/multiexp_test.go | 12 ++++++++++++ ecc/bn254/multiexp_test.go | 12 ++++++++++++ ecc/bw6-633/multiexp_test.go | 12 ++++++++++++ ecc/bw6-756/multiexp_test.go | 12 ++++++++++++ ecc/bw6-761/multiexp_test.go | 12 ++++++++++++ .../generator/ecc/template/tests/multiexp.go.tmpl | 7 +++++++ 10 files changed, 115 insertions(+) diff --git a/ecc/bls12-377/multiexp_test.go b/ecc/bls12-377/multiexp_test.go index f2487e2edc..eaf9952de7 100644 --- a/ecc/bls12-377/multiexp_test.go +++ b/ecc/bls12-377/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bls12-378/multiexp_test.go b/ecc/bls12-378/multiexp_test.go index 55524da71e..7fcb05040a 100644 --- a/ecc/bls12-378/multiexp_test.go +++ b/ecc/bls12-378/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
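The fix above works because each batch-affine chunk processor keeps a parallel set of extended-Jacobian buckets: the batched affine addition amortizes one field inversion across the whole batch to compute lambda = (yP - yR)/(xP - xR), so it cannot handle P == R, where that denominator vanishes and a doubling would need lambda = 3x^2/(2y) instead. Rather than implementing an affine doubling inside the batch, the operation is diverted to the extended-Jacobian bucket with the same bucketID; assuming the final bucket value is the sum of the affine and extended-Jacobian halves (the reduction itself is not shown in these hunks), adding one copy of the point there while the affine bucket keeps the other yields 2*P in total. A consolidated sketch of the four equal-X cases, using the names from the diff:

// BK: affine bucket, PP: incoming affine point, bucketsJE: parallel
// extended-Jacobian buckets. Net bucket value = BK + bucketsJE[bucketID].
if BK.X.Equal(&PP.X) {
	if BK.Y.Equal(&PP.Y) { // BK == PP
		if isAdd {
			bucketsJE[bucketID].addMixed(PP) // total: PP + PP = 2*PP
		} else {
			BK.setInfinity() // total: PP - PP = 0
		}
	} else { // BK == -PP
		if isAdd {
			BK.setInfinity() // total: -PP + PP = 0
		} else {
			bucketsJE[bucketID].subMixed(PP) // total: -PP - PP = -2*PP
		}
	}
	return
}

The "sprinkle some doublings" loops added to the tests below exercise exactly this path: duplicating (point, scalar) pairs guarantees that identical digits route the same point into the same bucket, so the BK == PP case actually occurs.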
// test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bls12-381/multiexp_test.go b/ecc/bls12-381/multiexp_test.go index 8d96b5c59e..4b013c619a 100644 --- a/ecc/bls12-381/multiexp_test.go +++ b/ecc/bls12-381/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bls24-315/multiexp_test.go b/ecc/bls24-315/multiexp_test.go index 4e67c67761..c86dfdad0d 100644 --- a/ecc/bls24-315/multiexp_test.go +++ b/ecc/bls24-315/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
// test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bls24-317/multiexp_test.go b/ecc/bls24-317/multiexp_test.go index 33e7c834c5..8287221bbd 100644 --- a/ecc/bls24-317/multiexp_test.go +++ b/ecc/bls24-317/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bn254/multiexp_test.go b/ecc/bn254/multiexp_test.go index 3307840f6a..e64b8c9c32 100644 --- a/ecc/bn254/multiexp_test.go +++ b/ecc/bn254/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
// test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bw6-633/multiexp_test.go b/ecc/bw6-633/multiexp_test.go index dc7ef60c2c..242ccf1fc0 100644 --- a/ecc/bw6-633/multiexp_test.go +++ b/ecc/bw6-633/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{4, 5, 8, 12, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bw6-756/multiexp_test.go b/ecc/bw6-756/multiexp_test.go index 308efca4c9..70643ca8f5 100644 --- a/ecc/bw6-756/multiexp_test.go +++ b/ecc/bw6-756/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{3, 4, 5, 8, 11, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. // test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/ecc/bw6-761/multiexp_test.go b/ecc/bw6-761/multiexp_test.go index 98560005bc..3532346ae9 100644 --- a/ecc/bw6-761/multiexp_test.go +++ b/ecc/bw6-761/multiexp_test.go @@ -256,6 +256,12 @@ func TestCrossMultiExpG1(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size cRange := []uint64{2, 3, 4, 5, 8, 10, 16} if testing.Short() { @@ -668,6 +674,12 @@ func TestCrossMultiExpG2(t *testing.T) { var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i := 10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size // for g2, CI suffers with large c size since it needs to allocate a lot of memory for the buckets. 
// test only "odd" and "even" (ie windows size divide word size vs not) diff --git a/internal/generator/ecc/template/tests/multiexp.go.tmpl b/internal/generator/ecc/template/tests/multiexp.go.tmpl index d99a83a49f..d929eb0334 100644 --- a/internal/generator/ecc/template/tests/multiexp.go.tmpl +++ b/internal/generator/ecc/template/tests/multiexp.go.tmpl @@ -265,9 +265,16 @@ func TestCrossMultiExp{{ $.UPointName }}(t *testing.T) { samplePoints[rand.Intn(nbSamples)].setInfinity() samplePoints[rand.Intn(nbSamples)].setInfinity() + var sampleScalars [nbSamples]fr.Element fillBenchScalars(sampleScalars[:]) + // sprinkle some doublings + for i:=10; i < 100; i++ { + samplePoints[i] = samplePoints[0] + sampleScalars[i] = sampleScalars[0] + } + // cRange is generated from template and contains the available parameters for the multiexp window size {{- if eq $.PointName "g1" }} cRange := []uint64{ From 4dbc3643a0e565e0d83716f22aeeae238fea8d25 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Thu, 17 Nov 2022 13:09:17 -0600 Subject: [PATCH 40/43] fix: msm partitionScalar - handle edge cases with carry --- ecc/bls12-377/multiexp.go | 48 ++++++++++------ ecc/bls12-377/multiexp_affine.go | 4 +- ecc/bls12-377/multiexp_jacobian.go | 8 +-- ecc/bls12-378/multiexp.go | 44 ++++++++++----- ecc/bls12-381/multiexp.go | 48 ++++++++++------ ecc/bls12-381/multiexp_affine.go | 4 +- ecc/bls12-381/multiexp_jacobian.go | 8 +-- ecc/bls24-315/multiexp.go | 48 ++++++++++------ ecc/bls24-315/multiexp_affine.go | 4 +- ecc/bls24-315/multiexp_jacobian.go | 8 +-- ecc/bls24-317/multiexp.go | 48 ++++++++++------ ecc/bls24-317/multiexp_affine.go | 4 +- ecc/bls24-317/multiexp_jacobian.go | 8 +-- ecc/bn254/multiexp.go | 44 ++++++++++----- ecc/bw6-633/multiexp.go | 48 ++++++++++------ ecc/bw6-633/multiexp_affine.go | 4 +- ecc/bw6-633/multiexp_jacobian.go | 8 +-- ecc/bw6-756/multiexp.go | 44 ++++++++++----- ecc/bw6-761/multiexp.go | 44 ++++++++++----- internal/field/field.go | 18 +++++- internal/generator/ecc/generate.go | 55 +++++++++++++------ .../generator/ecc/template/multiexp.go.tmpl | 44 ++++++++++----- 22 files changed, 370 insertions(+), 223 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 00f3a97050..c81bd397ad 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 2: return processChunkG1Jacobian[bucketg1JacExtendedC2] case 4: @@ -444,8 +442,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 2: return processChunkG2Jacobian[bucketg2JacExtendedC2] case 4: @@ -560,25 +556,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return 
nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1001010101011011 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +677,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls12-377/multiexp_affine.go b/ecc/bls12-377/multiexp_affine.go index df95b34370..5aa3546b5e 100644 --- a/ecc/bls12-377/multiexp_affine.go +++ b/ecc/bls12-377/multiexp_affine.go @@ -653,7 +653,6 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC2 [2]bool type bitSetC4 [8]bool type bitSetC5 [16]bool @@ -670,8 +669,7 @@ type bitSetC15 [16384]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC2 | + bitSetC2 | bitSetC4 | bitSetC5 | bitSetC6 | diff --git a/ecc/bls12-377/multiexp_jacobian.go b/ecc/bls12-377/multiexp_jacobian.go index f34d5ff332..e01f5567f5 100644 --- a/ecc/bls12-377/multiexp_jacobian.go +++ b/ecc/bls12-377/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC2 [2]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended @@ -78,8 +77,7 @@ type bucketg1JacExtendedC15 [16384]g1JacExtended type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,7 +138,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC2 [2]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended @@ -157,8 +154,7 @@ type bucketg2JacExtendedC15 [16384]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 133242f8af..8e710a39c2 100644 
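In words: computeNbChunks is now a plain ceiling division over fr.Bits, and the old "+1 for the carry" slack moves into lastC, which widens the final window by the leftover bits nbAvailableBits = nbChunks*c - fr.Bits. Only when c divides fr.Bits exactly is there no headroom, and the patch then inspects the top c bits of the modulus (the per-curve qMsb16 constant): if they are not all ones, a canonical scalar's top window can absorb the +1 carry without overflowing. A worked example, assuming fr.Bits = 253 (the BLS12-377 scalar field, whose modulus MSBs match the qMsb16 above):

// nbChunks = (fr.Bits + c - 1) / c ; nbAvailableBits = nbChunks*c - fr.Bits
//
// c = 16: nbChunks = (253+15)/16 = 16, spanning 256 bits, so the last
//         window has 256 - 253 = 3 spare bits and the carry always fits.
// c = 11: nbChunks = 23, spanning exactly 253 bits, so nbAvailableBits = 0;
//         the carry still fits because the top 11 modulus bits,
//         qMsb16 >> 5 = 0b10010101010, are not all ones, and inputs are
//         assumed reduced below the modulus.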
--- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -560,25 +560,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1000001110011110 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +681,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index 60e4686759..173d99e6ee 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 3: return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: @@ -444,8 +442,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 3: return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: @@ -560,25 +556,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + 
nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1110011111011011 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +677,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls12-381/multiexp_affine.go b/ecc/bls12-381/multiexp_affine.go index 66f1e361a1..f2fcc05732 100644 --- a/ecc/bls12-381/multiexp_affine.go +++ b/ecc/bls12-381/multiexp_affine.go @@ -653,7 +653,6 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC3 [4]bool type bitSetC4 [8]bool type bitSetC5 [16]bool @@ -670,8 +669,7 @@ type bitSetC15 [16384]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC3 | + bitSetC3 | bitSetC4 | bitSetC5 | bitSetC6 | diff --git a/ecc/bls12-381/multiexp_jacobian.go b/ecc/bls12-381/multiexp_jacobian.go index 55cdd766b5..2a2f8caa85 100644 --- a/ecc/bls12-381/multiexp_jacobian.go +++ b/ecc/bls12-381/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC3 [4]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended @@ -78,8 +77,7 @@ type bucketg1JacExtendedC15 [16384]g1JacExtended type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC3 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,7 +138,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC3 [4]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended @@ -157,8 +154,7 @@ type bucketg2JacExtendedC15 [16384]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC3 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 85c2a14d17..488bdd5837 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits 
[]uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 2: return processChunkG1Jacobian[bucketg1JacExtendedC2] case 4: @@ -444,8 +442,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 2: return processChunkG2Jacobian[bucketg2JacExtendedC2] case 4: @@ -560,25 +556,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1100101101101111 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +677,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls24-315/multiexp_affine.go b/ecc/bls24-315/multiexp_affine.go index 5db6512603..40a45408fe 100644 --- a/ecc/bls24-315/multiexp_affine.go +++ b/ecc/bls24-315/multiexp_affine.go @@ -653,7 +653,6 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC2 [2]bool type bitSetC4 [8]bool type bitSetC5 [16]bool @@ -670,8 +669,7 @@ type bitSetC15 [16384]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC2 | + bitSetC2 | bitSetC4 | bitSetC5 | bitSetC6 | diff --git a/ecc/bls24-315/multiexp_jacobian.go b/ecc/bls24-315/multiexp_jacobian.go index 0cd3432dad..be0bb121b4 100644 --- a/ecc/bls24-315/multiexp_jacobian.go +++ b/ecc/bls24-315/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC2 [2]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended @@ -78,8 +77,7 @@ type bucketg1JacExtendedC15 [16384]g1JacExtended type 
bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC2 | + bucketg1JacExtendedC2 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,7 +138,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC2 [2]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended @@ -157,8 +154,7 @@ type bucketg2JacExtendedC15 [16384]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC2 | + bucketg2JacExtendedC2 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index 733358396c..f5f31de3d4 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 3: return processChunkG1Jacobian[bucketg1JacExtendedC3] case 4: @@ -444,8 +442,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 3: return processChunkG2Jacobian[bucketg2JacExtendedC3] case 4: @@ -560,25 +556,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1000100001111111 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +677,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << 
s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bls24-317/multiexp_affine.go b/ecc/bls24-317/multiexp_affine.go index 37050a251b..803835d815 100644 --- a/ecc/bls24-317/multiexp_affine.go +++ b/ecc/bls24-317/multiexp_affine.go @@ -653,7 +653,6 @@ type pG2AffineC16 [640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC3 [4]bool type bitSetC4 [8]bool type bitSetC5 [16]bool @@ -670,8 +669,7 @@ type bitSetC15 [16384]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC3 | + bitSetC3 | bitSetC4 | bitSetC5 | bitSetC6 | diff --git a/ecc/bls24-317/multiexp_jacobian.go b/ecc/bls24-317/multiexp_jacobian.go index 6ecfd659e9..15fbf46f0e 100644 --- a/ecc/bls24-317/multiexp_jacobian.go +++ b/ecc/bls24-317/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC3 [4]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended @@ -78,8 +77,7 @@ type bucketg1JacExtendedC15 [16384]g1JacExtended type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC3 | + bucketg1JacExtendedC3 | bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC6 | @@ -140,7 +138,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC3 [4]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended @@ -157,8 +154,7 @@ type bucketg2JacExtendedC15 [16384]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC3 | + bucketg2JacExtendedC3 | bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC6 | diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index d373d1683e..88a5e5a1e2 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -560,25 +560,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1100000110010001 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -679,6 +681,20 @@ func partitionScalars(scalars 
[]fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index dbbd344b08..e78cb54082 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -183,8 +183,6 @@ func _innerMsmG1(p *G1Jac, c uint64, points []G1Affine, scalars []fr.Element, co func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g1JacExtended, c uint64, points []G1Affine, digits []uint16) { switch c { - case 1: - return processChunkG1Jacobian[bucketg1JacExtendedC1] case 4: return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: @@ -391,8 +389,6 @@ func _innerMsmG2(p *G2Jac, c uint64, points []G2Affine, scalars []fr.Element, co func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes chan<- g2JacExtended, c uint64, points []G2Affine, digits []uint16) { switch c { - case 1: - return processChunkG2Jacobian[bucketg2JacExtendedC1] case 4: return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: @@ -454,25 +450,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1001100001000111 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -573,6 +571,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bw6-633/multiexp_affine.go b/ecc/bw6-633/multiexp_affine.go index f7189e2c35..fc4e13a30f 100644 --- a/ecc/bw6-633/multiexp_affine.go +++ b/ecc/bw6-633/multiexp_affine.go @@ -532,7 +532,6 @@ type pG2AffineC16 
[640]G2Affine type ppG2AffineC16 [640]*G2Affine type qG2AffineC16 [640]batchOpG2Affine -type bitSetC1 [1]bool type bitSetC4 [8]bool type bitSetC5 [16]bool type bitSetC8 [128]bool @@ -540,8 +539,7 @@ type bitSetC12 [2048]bool type bitSetC16 [32768]bool type bitSet interface { - bitSetC1 | - bitSetC4 | + bitSetC4 | bitSetC5 | bitSetC8 | bitSetC12 | diff --git a/ecc/bw6-633/multiexp_jacobian.go b/ecc/bw6-633/multiexp_jacobian.go index eb4a8a2a02..7e44a83aab 100644 --- a/ecc/bw6-633/multiexp_jacobian.go +++ b/ecc/bw6-633/multiexp_jacobian.go @@ -61,7 +61,6 @@ func processChunkG1Jacobian[B ibg1JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg1JacExtendedC1 [1]g1JacExtended type bucketg1JacExtendedC4 [8]g1JacExtended type bucketg1JacExtendedC5 [16]g1JacExtended type bucketg1JacExtendedC8 [128]g1JacExtended @@ -69,8 +68,7 @@ type bucketg1JacExtendedC12 [2048]g1JacExtended type bucketg1JacExtendedC16 [32768]g1JacExtended type ibg1JacExtended interface { - bucketg1JacExtendedC1 | - bucketg1JacExtendedC4 | + bucketg1JacExtendedC4 | bucketg1JacExtendedC5 | bucketg1JacExtendedC8 | bucketg1JacExtendedC12 | @@ -122,7 +120,6 @@ func processChunkG2Jacobian[B ibg2JacExtended](chunk uint64, // we declare the buckets as fixed-size array types // this allow us to allocate the buckets on the stack -type bucketg2JacExtendedC1 [1]g2JacExtended type bucketg2JacExtendedC4 [8]g2JacExtended type bucketg2JacExtendedC5 [16]g2JacExtended type bucketg2JacExtendedC8 [128]g2JacExtended @@ -130,8 +127,7 @@ type bucketg2JacExtendedC12 [2048]g2JacExtended type bucketg2JacExtendedC16 [32768]g2JacExtended type ibg2JacExtended interface { - bucketg2JacExtendedC1 | - bucketg2JacExtendedC4 | + bucketg2JacExtendedC4 | bucketg2JacExtendedC5 | bucketg2JacExtendedC8 | bucketg2JacExtendedC12 | diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go index 83b43d9a33..55da8d2db1 100644 --- a/ecc/bw6-756/multiexp.go +++ b/ecc/bw6-756/multiexp.go @@ -454,25 +454,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1111101110101100 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -573,6 +575,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the 
c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go index a0d2a19620..ba723c154a 100644 --- a/ecc/bw6-761/multiexp.go +++ b/ecc/bw6-761/multiexp.go @@ -458,25 +458,27 @@ type selector struct { } // return number of chunks for a given window size c +// the last chunk may be bigger to accomodate a potential carry from the NAF decomposition func computeNbChunks(c uint64) uint64 { - // note that we use fr.Bits + 1 --> +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits + 1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits + c - 1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b1101011100011101 + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -577,6 +579,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1]&s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) diff --git a/internal/field/field.go b/internal/field/field.go index 714e09770b..34783417bf 100644 --- a/internal/field/field.go +++ b/internal/field/field.go @@ -18,13 +18,14 @@ package field import ( "errors" "fmt" - "github.com/consensys/bavard" - "github.com/consensys/gnark-crypto/internal/field/internal/addchain" "math" "math/big" "math/bits" "strconv" "strings" + + "github.com/consensys/bavard" + "github.com/consensys/gnark-crypto/internal/field/internal/addchain" ) var ( @@ -38,6 +39,7 @@ type FieldConfig struct { ModulusBig *big.Int Modulus string ModulusHex string + ModulusSixteenMSB uint64 // 16 most significant bits of the modulus, right-aligned. 
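// Why the generator keeps these 16 bits: lastC uses them to decide whether the
// most-significant window of the modulus leaves head-room for the NAF carry.
// A minimal sketch of that check (hypothetical names, not part of the
// generated code; assumes c <= 16 and scalars reduced mod q):
//
//	msbC := modulusSixteenMSB >> (16 - c)  // top c bits of q
//	hasRoom := msbC&((1<<c)-1) != (1<<c)-1 // not all ones => a carry fits
//
// If hasRoom, the last window can stay c bits wide even when the chunk count
// leaves no spare bit. (PATCH 41 below drops this edge case again.)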
NbWords int NbBits int NbWordsLastIndex int @@ -96,6 +98,18 @@ func NewFieldConfig(packageName, elementName, modulus string, useAddChain bool) F.NbBits = bModulus.BitLen() F.NbWords = len(bModulus.Bits()) + // compute the 16 msb; + if F.NbBits <= 16 { + F.ModulusSixteenMSB = F.ModulusBig.Uint64() + } else { + msb := new(big.Int) + msb.Rsh(F.ModulusBig, uint(F.NbBits)-16) + if msb.BitLen() != 16 { + panic("sanity check.") + } + F.ModulusSixteenMSB = msb.Uint64() + } + F.NbWordsLastIndex = F.NbWords - 1 // set q from big int repr diff --git a/internal/generator/ecc/generate.go b/internal/generator/ecc/generate.go index 2dce39800b..8283f912fd 100644 --- a/internal/generator/ecc/generate.go +++ b/internal/generator/ecc/generate.go @@ -28,12 +28,23 @@ func Generate(conf config.Curve, baseDir string, bgen *bavard.BatchGenerator) er funcs["last"] = func(x int, a interface{}) bool { return x == reflect.ValueOf(a).Len()-1 } + funcs["binary"] = func(x uint64) string { + return strings.TrimSpace(fmt.Sprintf("%b", x)) + } lastC := func(c int) int { - n := (conf.Fr.NbBits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbChunks := (conf.Fr.NbBits + c - 1) / c + nbAvailableBits := (nbChunks * c) - conf.Fr.NbBits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + msb16 := conf.Fr.ModulusSixteenMSB + msbC := msb16 >> (16 - c) + if !(msbC&((1< +1 for a potential carry propagation due to the NAF - // decomposition in partitionScalars - nbChunks := (fr.Bits+1) / c - if (fr.Bits+1)%c != 0 { - nbChunks++ - } - return nbChunks + return (fr.Bits+c-1) / c } // return the last window size for a scalar; if c divides the scalar size // then it returns c // if not, returns lastC << c func lastC(c uint64) uint64 { - const n = (fr.Bits + 1) // +1 for the potential carry of the NAF decomposition - if n%c == 0 { - return c + nbAvailableBits := (computeNbChunks(c)*c) - fr.Bits + if nbAvailableBits == 0 { + // we can push a bit the edge case here; + // if the c-msb bits of modulus are not all ones, we have space for the carry + // (assuming inputs are smaller than modulus) + const qMsb16 = 0b{{binary .Fr.ModulusSixteenMSB}} + msbC := qMsb16 >> (16 - c) + if !(msbC&((1<= max { + if digit > max { digit -= (1 << c) carry = 1 } @@ -157,6 +159,20 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks } digits[int(chunk)*len(scalars)+i] = bits } + + // for the last chunk, we don't want to borrow from a next window + // (but may have a larger max value) + chunk := nbChunks - 1 + s := selectors[chunk] + // init with carry if any + digit := carry + // digit = value of the c-bit window + digit += int((scalar[s.index] & s.mask) >> s.shift) + if s.multiWordSelect { + // we are selecting bits over 2 words + digit += int(scalar[s.index+1] & s.maskHigh) << s.shiftHigh + } + digits[int(chunk)*len(scalars)+i] = uint16(digit) << 1 } }, nbTasks) From e3b29f7537f89c7369b5848adbf41010672f7f85 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Thu, 17 Nov 2022 13:26:29 -0600 Subject: [PATCH 41/43] fix: add panic in generator when c > 16 --- ecc/bls12-377/multiexp.go | 18 ++++--------- ecc/bls12-378/multiexp.go | 18 ++++--------- ecc/bls12-381/multiexp.go | 18 ++++--------- ecc/bls24-315/multiexp.go | 18 ++++--------- ecc/bls24-317/multiexp.go | 18 ++++--------- ecc/bn254/multiexp.go | 18 ++++--------- ecc/bw6-633/multiexp.go | 26 
++++++++----------- ecc/bw6-633/multiexp_affine.go | 2 ++ ecc/bw6-633/multiexp_jacobian.go | 4 +++ ecc/bw6-633/multiexp_test.go | 4 +-- ecc/bw6-756/multiexp.go | 18 ++++--------- ecc/bw6-761/multiexp.go | 18 ++++--------- internal/field/field.go | 13 ---------- internal/generator/ecc/generate.go | 23 +++++++--------- .../generator/ecc/template/multiexp.go.tmpl | 18 ++++--------- 15 files changed, 74 insertions(+), 160 deletions(-) diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index c81bd397ad..7e05079bb4 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -561,21 +561,13 @@ func computeNbChunks(c uint64) uint64 { return (fr.Bits + c - 1) / c } -// return the last window size for a scalar; if c divides the scalar size -// then it returns c -// if not, returns lastC << c +// return the last window size for a scalar; +// this last window should accomodate a carry (from the NAF decomposition) +// it can be == c if we have 1 available bit +// it can be > c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1001010101011011 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1000001110011110 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1110011111011011 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1100101101101111 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1000100001111111 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are 
smaller than modulus) - const qMsb16 = 0b1100000110010001 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1<= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 12, 16} + implementedCs := []uint64{4, 5, 6, 8, 12, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -187,6 +187,8 @@ func getChunkProcessorG1(c uint64, stat chunkStat) func(chunkID uint64, chRes ch return processChunkG1Jacobian[bucketg1JacExtendedC4] case 5: return processChunkG1Jacobian[bucketg1JacExtendedC5] + case 6: + return processChunkG1Jacobian[bucketg1JacExtendedC6] case 8: return processChunkG1Jacobian[bucketg1JacExtendedC8] case 12: @@ -290,7 +292,7 @@ func (p *G2Jac) MultiExp(points []G2Affine, scalars []fr.Element, config ecc.Mul // we split recursively until nbChunks(c) >= nbTasks, bestC := func(nbPoints int) uint64 { // implemented msmC methods (the c we use must be in this slice) - implementedCs := []uint64{4, 5, 8, 12, 16} + implementedCs := []uint64{4, 5, 6, 8, 12, 16} var C uint64 // approximate cost (in group operations) // cost = bits/c * (nbPoints + 2^{c}) @@ -393,6 +395,8 @@ func getChunkProcessorG2(c uint64, stat chunkStat) func(chunkID uint64, chRes ch return processChunkG2Jacobian[bucketg2JacExtendedC4] case 5: return processChunkG2Jacobian[bucketg2JacExtendedC5] + case 6: + return processChunkG2Jacobian[bucketg2JacExtendedC6] case 8: return processChunkG2Jacobian[bucketg2JacExtendedC8] case 12: @@ -455,21 +459,13 @@ func computeNbChunks(c uint64) uint64 { return (fr.Bits + c - 1) / c } -// return the last window size for a scalar; if c divides the scalar size -// then it returns c -// if not, returns lastC << c +// return the last window size for a scalar; +// this last window should accomodate a carry (from the NAF decomposition) +// it can be == c if we have 1 available bit +// it can be > c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1001100001000111 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1111101110101100 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c) * c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b1101011100011101 - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< c if we have 0 available bit + // it can be < c if we have 2+ available bits lastC := func(c int) int { nbChunks := (conf.Fr.NbBits + c - 1) / c nbAvailableBits := (nbChunks * c) - conf.Fr.NbBits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the 
c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - msb16 := conf.Fr.ModulusSixteenMSB - msbC := msb16 >> (16 - c) - if !(msbC&((1< 16 { + panic("we have a problem since we are using uint16 to store digits") } - return c + 1 - nbAvailableBits + return lc } batchSize := func(c int) int { // nbBuckets := (1 << (c - 1)) diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl index 19ee187a84..2f892e0c99 100644 --- a/internal/generator/ecc/template/multiexp.go.tmpl +++ b/internal/generator/ecc/template/multiexp.go.tmpl @@ -38,21 +38,13 @@ func computeNbChunks(c uint64) uint64 { return (fr.Bits+c-1) / c } -// return the last window size for a scalar; if c divides the scalar size -// then it returns c -// if not, returns lastC << c +// return the last window size for a scalar; +// this last window should accomodate a carry (from the NAF decomposition) +// it can be == c if we have 1 available bit +// it can be > c if we have 0 available bit +// it can be < c if we have 2+ available bits func lastC(c uint64) uint64 { nbAvailableBits := (computeNbChunks(c)*c) - fr.Bits - if nbAvailableBits == 0 { - // we can push a bit the edge case here; - // if the c-msb bits of modulus are not all ones, we have space for the carry - // (assuming inputs are smaller than modulus) - const qMsb16 = 0b{{binary .Fr.ModulusSixteenMSB}} - msbC := qMsb16 >> (16 - c) - if !(msbC&((1< Date: Mon, 21 Nov 2022 18:22:29 +0100 Subject: [PATCH 42/43] perf: remove 3 muls by 1 in batchAdd --- ecc/bls12-377/g1.go | 8 +++++--- ecc/bls12-377/g2.go | 8 +++++--- ecc/bls12-378/g1.go | 8 +++++--- ecc/bls12-378/g2.go | 8 +++++--- ecc/bls12-381/g1.go | 8 +++++--- ecc/bls12-381/g2.go | 8 +++++--- ecc/bls24-315/g1.go | 8 +++++--- ecc/bls24-315/g2.go | 8 +++++--- ecc/bls24-317/g1.go | 8 +++++--- ecc/bls24-317/g2.go | 8 +++++--- ecc/bn254/g1.go | 8 +++++--- ecc/bn254/g2.go | 8 +++++--- ecc/bw6-633/g1.go | 8 +++++--- ecc/bw6-633/g2.go | 8 +++++--- ecc/bw6-756/g1.go | 8 +++++--- ecc/bw6-756/g2.go | 8 +++++--- ecc/bw6-761/g1.go | 8 +++++--- ecc/bw6-761/g2.go | 8 +++++--- internal/generator/ecc/template/point.go.tmpl | 15 ++++++++------- 19 files changed, 98 insertions(+), 61 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 910dd07b5e..5a6f659d36 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -994,19 +994,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 0fe9a4119c..011e336c8b 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -990,19 +990,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E2 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 
1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E2 diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index 67d64790d7..a409696a97 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -994,19 +994,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index 905e2ba893..eb26579bba 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -990,19 +990,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E2 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E2 diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 474c868025..37c71feaae 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -994,19 +994,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index a8575f59f7..d65c640328 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -991,19 +991,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E2 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E2 diff 
--git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index bde8a50d43..cb66154c0f 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -996,19 +996,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 662bfe0313..8ec580405e 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -1006,19 +1006,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E4 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E4 diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index cd9452b1ce..50fdef55c9 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -996,19 +996,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 96d823eaf9..72a3631d39 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -1006,19 +1006,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E4 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E4 diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index 5bad4b316c..fe0af18997 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -966,19 +966,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + 
lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index 09011b0c53..c4f6caaf9f 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -995,19 +995,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fptower.E2 - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fptower.E2 diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index dc2289ac76..ed35639923 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -1098,19 +1098,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index 3d27026424..a045759de4 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -961,19 +961,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go index 5cbf001665..d69651047a 100644 --- a/ecc/bw6-756/g1.go +++ b/ecc/bw6-756/g1.go @@ -1098,19 +1098,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, 
&lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go index e8b048fb9b..822d56145a 100644 --- a/ecc/bw6-756/g2.go +++ b/ecc/bw6-756/g2.go @@ -955,19 +955,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go index d6de060519..9fee8ba8de 100644 --- a/ecc/bw6-761/g1.go +++ b/ecc/bw6-761/g1.go @@ -1109,19 +1109,21 @@ func batchAddG1Affine[TP pG1Affine, TPP ppG1Affine, TC cG1Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go index b1b8b664dd..174b33829a 100644 --- a/ecc/bw6-761/g2.go +++ b/ecc/bw6-761/g2.go @@ -969,19 +969,21 @@ func batchAddG2Affine[TP pG2Affine, TPP ppG2Affine, TC cG2Affine](R *TPP, P *TP, // invert denominator using montgomery batch invert technique { var accumulator fp.Element - accumulator.SetOne() + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) - for i := 0; i < batchSize; i++ { + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } accumulator.Inverse(&accumulator) - for i := batchSize - 1; i >= 0; i-- { + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d fp.Element diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl index c5455ff072..57e40eb2c4 100644 --- a/internal/generator/ecc/template/point.go.tmpl +++ b/internal/generator/ecc/template/point.go.tmpl @@ -1586,19 +1586,21 @@ func batchAdd{{ $TAffine }}[TP p{{ $TAffine }}, TPP pp{{ $TAffine }}, TC c{{ $TA // invert denominator using montgomery batch invert technique { var accumulator {{.CoordType}} - accumulator.SetOne() - - for i := 0; i < batchSize; i++ { + lambda[0].SetOne() + accumulator.Set(&lambdain[0]) + + for i := 1; i < batchSize; i++ { lambda[i] = accumulator accumulator.Mul(&accumulator, &lambdain[i]) } - + accumulator.Inverse(&accumulator) - - for i := batchSize - 1; i >= 0; i-- { + + for i := batchSize - 1; i > 0; i-- { lambda[i].Mul(&lambda[i], &accumulator) accumulator.Mul(&accumulator, &lambdain[i]) } + lambda[0].Set(&accumulator) } var d {{.CoordType}} @@ -1620,4 +1622,3 @@ func batchAdd{{ $TAffine }}[TP p{{ $TAffine }}, TPP pp{{ $TAffine }}, TC c{{ $TA (*R)[j].Set(&rr) } } - From 9673409560e5db16885180a462951a74e12a9335 Mon Sep 
17 00:00:00 2001 From: Youssef El Housni Date: Mon, 21 Nov 2022 19:57:02 +0100 Subject: [PATCH 43/43] docs: add comment regarding double(infinity) in ext-Jac --- ecc/bls12-377/g1.go | 2 + ecc/bls12-377/g2.go | 2 + ecc/bls12-377/multiexp.go | 4 -- ecc/bls12-378/g1.go | 2 + ecc/bls12-378/g2.go | 2 + ecc/bls12-378/multiexp.go | 4 -- ecc/bls12-381/g1.go | 2 + ecc/bls12-381/g2.go | 2 + ecc/bls12-381/multiexp.go | 4 -- ecc/bls24-315/g1.go | 2 + ecc/bls24-315/g2.go | 2 + ecc/bls24-315/multiexp.go | 4 -- ecc/bls24-317/g1.go | 2 + ecc/bls24-317/g2.go | 2 + ecc/bls24-317/multiexp.go | 4 -- ecc/bn254/g1.go | 2 + ecc/bn254/g2.go | 2 + ecc/bn254/multiexp.go | 4 -- ecc/bw6-633/g1.go | 2 + ecc/bw6-633/g2.go | 2 + ecc/bw6-633/multiexp.go | 4 -- ecc/bw6-756/g1.go | 2 + ecc/bw6-756/g2.go | 2 + ecc/bw6-756/multiexp.go | 4 -- ecc/bw6-761/g1.go | 2 + ecc/bw6-761/g2.go | 2 + ecc/bw6-761/multiexp.go | 4 -- .../generator/ecc/template/multiexp.go.tmpl | 56 +++++++++---------- internal/generator/ecc/template/point.go.tmpl | 2 + 29 files changed, 65 insertions(+), 65 deletions(-) diff --git a/ecc/bls12-377/g1.go b/ecc/bls12-377/g1.go index 5a6f659d36..d3c6e729da 100644 --- a/ecc/bls12-377/g1.go +++ b/ecc/bls12-377/g1.go @@ -635,6 +635,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls12-377/g2.go b/ecc/bls12-377/g2.go index 011e336c8b..51660e48a6 100644 --- a/ecc/bls12-377/g2.go +++ b/ecc/bls12-377/g2.go @@ -653,6 +653,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E2 diff --git a/ecc/bls12-377/multiexp.go b/ecc/bls12-377/multiexp.go index 7e05079bb4..13c7dd331e 100644 --- a/ecc/bls12-377/multiexp.go +++ b/ecc/bls12-377/multiexp.go @@ -273,14 +273,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -532,14 +530,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
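// The TODO that used to sit here ("what if _p is infinity here?") is settled
// by the doc comment added to double above: the point at infinity is any
// point with ZZ == 0, and in the dbl-2008-s-1 formulas the output coordinates
// are ZZ3 = V*ZZ and ZZZ3 = W*ZZZ, so an infinity input is mapped back to
// infinity and the doubling loop needs no special case.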
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-378/g1.go b/ecc/bls12-378/g1.go index a409696a97..eda699043c 100644 --- a/ecc/bls12-378/g1.go +++ b/ecc/bls12-378/g1.go @@ -635,6 +635,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls12-378/g2.go b/ecc/bls12-378/g2.go index eb26579bba..1c5c7a17b5 100644 --- a/ecc/bls12-378/g2.go +++ b/ecc/bls12-378/g2.go @@ -653,6 +653,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E2 diff --git a/ecc/bls12-378/multiexp.go b/ecc/bls12-378/multiexp.go index 9e1685cca8..a52821b9f9 100644 --- a/ecc/bls12-378/multiexp.go +++ b/ecc/bls12-378/multiexp.go @@ -275,14 +275,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -536,14 +534,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
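// Aside: a self-contained sketch of the Montgomery batch-inversion pattern
// reworked in PATCH 42 above, with the two redundant multiplications by one
// on lambda[0] removed. math/big stands in for fp.Element; hypothetical
// helper, assuming q prime, len(a) > 0 and every a[i] invertible mod q.
func batchInvertSketch(a []*big.Int, q *big.Int) []*big.Int {
	lambda := make([]*big.Int, len(a))
	acc := new(big.Int).Set(a[0]) // running product a[0]*...*a[i]
	lambda[0] = new(big.Int)      // filled at the very end
	for i := 1; i < len(a); i++ {
		lambda[i] = new(big.Int).Set(acc) // = a[0]*...*a[i-1]
		acc.Mul(acc, a[i]).Mod(acc, q)
	}
	acc.ModInverse(acc, q) // the single inversion, amortized over the batch
	for i := len(a) - 1; i > 0; i-- {
		lambda[i].Mul(lambda[i], acc).Mod(lambda[i], q) // = a[i]^-1
		acc.Mul(acc, a[i]).Mod(acc, q)                  // strip a[i] from the inverse
	}
	lambda[0].Set(acc) // once everything else is stripped, acc = a[0]^-1
	return lambda
}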
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls12-381/g1.go b/ecc/bls12-381/g1.go index 37c71feaae..0c71c6b17a 100644 --- a/ecc/bls12-381/g1.go +++ b/ecc/bls12-381/g1.go @@ -635,6 +635,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls12-381/g2.go b/ecc/bls12-381/g2.go index d65c640328..e999ff48af 100644 --- a/ecc/bls12-381/g2.go +++ b/ecc/bls12-381/g2.go @@ -654,6 +654,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E2 diff --git a/ecc/bls12-381/multiexp.go b/ecc/bls12-381/multiexp.go index cdf5680ca4..2d83a2eca2 100644 --- a/ecc/bls12-381/multiexp.go +++ b/ecc/bls12-381/multiexp.go @@ -273,14 +273,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -532,14 +530,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
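// The reduction loop above is Horner's rule in the exponent: with window
// size c and per-chunk sums t_j, the result is sum_j t_j * 2^{c*j},
// accumulated as ((t_{k-1} * 2^c + t_{k-2}) * 2^c + ...), where each
// multiplication by 2^c is the run of c doublings. A scalar analogue
// (hypothetical name, uint64 standing in for the group, chunks non-empty):
func reduceChunksSketch(chunks []uint64, c uint) uint64 {
	res := chunks[len(chunks)-1]
	for j := len(chunks) - 2; j >= 0; j-- {
		res <<= c // the c doublings of _p
		res += chunks[j]
	}
	return res
}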
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls24-315/g1.go b/ecc/bls24-315/g1.go index cb66154c0f..7a47da919c 100644 --- a/ecc/bls24-315/g1.go +++ b/ecc/bls24-315/g1.go @@ -637,6 +637,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls24-315/g2.go b/ecc/bls24-315/g2.go index 8ec580405e..06d38bfc1d 100644 --- a/ecc/bls24-315/g2.go +++ b/ecc/bls24-315/g2.go @@ -669,6 +669,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E4 diff --git a/ecc/bls24-315/multiexp.go b/ecc/bls24-315/multiexp.go index 4d546776e5..2b440ca494 100644 --- a/ecc/bls24-315/multiexp.go +++ b/ecc/bls24-315/multiexp.go @@ -273,14 +273,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -532,14 +530,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bls24-317/g1.go b/ecc/bls24-317/g1.go index 50fdef55c9..3531d495cb 100644 --- a/ecc/bls24-317/g1.go +++ b/ecc/bls24-317/g1.go @@ -637,6 +637,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bls24-317/g2.go b/ecc/bls24-317/g2.go index 72a3631d39..1bd9fa92c5 100644 --- a/ecc/bls24-317/g2.go +++ b/ecc/bls24-317/g2.go @@ -669,6 +669,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E4 diff --git a/ecc/bls24-317/multiexp.go b/ecc/bls24-317/multiexp.go index c92af97379..19d613fb55 100644 --- a/ecc/bls24-317/multiexp.go +++ b/ecc/bls24-317/multiexp.go @@ -273,14 +273,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -532,14 +530,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bn254/g1.go b/ecc/bn254/g1.go index fe0af18997..2682d9c6fe 100644 --- a/ecc/bn254/g1.go +++ b/ecc/bn254/g1.go @@ -607,6 +607,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bn254/g2.go b/ecc/bn254/g2.go index c4f6caaf9f..8026f982db 100644 --- a/ecc/bn254/g2.go +++ b/ecc/bn254/g2.go @@ -658,6 +658,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fptower.E2 diff --git a/ecc/bn254/multiexp.go b/ecc/bn254/multiexp.go index e7f8fc56bd..b70a8c4641 100644 --- a/ecc/bn254/multiexp.go +++ b/ecc/bn254/multiexp.go @@ -275,14 +275,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -536,14 +534,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
return p.unsafeFromJacExtended(&_p) } diff --git a/ecc/bw6-633/g1.go b/ecc/bw6-633/g1.go index ed35639923..78eeb6e8f8 100644 --- a/ecc/bw6-633/g1.go +++ b/ecc/bw6-633/g1.go @@ -664,6 +664,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bw6-633/g2.go b/ecc/bw6-633/g2.go index a045759de4..a927f9f8a7 100644 --- a/ecc/bw6-633/g2.go +++ b/ecc/bw6-633/g2.go @@ -654,6 +654,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended { // double point in Jacobian extended coordinates // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 +// since we consider any point on Z=0 as the point at infinity +// this doubling formula works for infinity points as well func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended { var U, V, W, S, XX, M fp.Element diff --git a/ecc/bw6-633/multiexp.go b/ecc/bw6-633/multiexp.go index 6dc4bbc779..c58ed424ab 100644 --- a/ecc/bw6-633/multiexp.go +++ b/ecc/bw6-633/multiexp.go @@ -222,14 +222,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? return p.unsafeFromJacExtended(&_p) } @@ -430,14 +428,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J _p.Set(&totalj) for j := len(chChunks) - 2; j >= 0; j-- { for l := 0; l < c; l++ { - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? _p.double(&_p) } totalj := <-chChunks[j] _p.add(&totalj) } - // TODO @gbotrel / @yelhousni --> what if _p is infinity here? 
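// Aside on how the window size is picked (the bestC closure earlier in this
// file, cf. PATCH 41): the approximate cost model is bits/c * (nbPoints + 2^c)
// group operations, minimized over the implemented window sizes. A toy
// version of that selection (hypothetical name; needs `import "math"` and
// ignores the recursive splitting until nbChunks(c) >= nbTasks):
func bestCSketch(nbPoints, bits int, implementedCs []int) int {
	best := implementedCs[0]
	bestCost := math.Inf(1)
	for _, c := range implementedCs {
		cost := float64(bits) / float64(c) * float64(nbPoints+(1<<c))
		if cost < bestCost {
			best, bestCost = c, cost
		}
	}
	return best
}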
 	return p.unsafeFromJacExtended(&_p)
 }
diff --git a/ecc/bw6-756/g1.go b/ecc/bw6-756/g1.go
index d69651047a..9d54281e86 100644
--- a/ecc/bw6-756/g1.go
+++ b/ecc/bw6-756/g1.go
@@ -664,6 +664,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended {
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended {
 	var U, V, W, S, XX, M fp.Element
 
diff --git a/ecc/bw6-756/g2.go b/ecc/bw6-756/g2.go
index 822d56145a..fe6189779b 100644
--- a/ecc/bw6-756/g2.go
+++ b/ecc/bw6-756/g2.go
@@ -648,6 +648,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended {
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended {
 	var U, V, W, S, XX, M fp.Element
 
diff --git a/ecc/bw6-756/multiexp.go b/ecc/bw6-756/multiexp.go
index edcb56f599..d2ac51b7ca 100644
--- a/ecc/bw6-756/multiexp.go
+++ b/ecc/bw6-756/multiexp.go
@@ -222,14 +222,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
 
@@ -430,14 +428,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
diff --git a/ecc/bw6-761/g1.go b/ecc/bw6-761/g1.go
index 9fee8ba8de..4f16b8f2a8 100644
--- a/ecc/bw6-761/g1.go
+++ b/ecc/bw6-761/g1.go
@@ -675,6 +675,8 @@ func (p *g1JacExtended) add(q *g1JacExtended) *g1JacExtended {
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *g1JacExtended) double(q *g1JacExtended) *g1JacExtended {
 	var U, V, W, S, XX, M fp.Element
 
diff --git a/ecc/bw6-761/g2.go b/ecc/bw6-761/g2.go
index 174b33829a..dc0fc1483e 100644
--- a/ecc/bw6-761/g2.go
+++ b/ecc/bw6-761/g2.go
@@ -662,6 +662,8 @@ func (p *g2JacExtended) add(q *g2JacExtended) *g2JacExtended {
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *g2JacExtended) double(q *g2JacExtended) *g2JacExtended {
 	var U, V, W, S, XX, M fp.Element
 
diff --git a/ecc/bw6-761/multiexp.go b/ecc/bw6-761/multiexp.go
index 46e37958fa..c17eee4727 100644
--- a/ecc/bw6-761/multiexp.go
+++ b/ecc/bw6-761/multiexp.go
@@ -224,14 +224,12 @@ func msmReduceChunkG1Affine(p *G1Jac, c int, chChunks []chan g1JacExtended) *G1J
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
 
@@ -434,14 +432,12 @@ func msmReduceChunkG2Affine(p *G2Jac, c int, chChunks []chan g2JacExtended) *G2J
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
diff --git a/internal/generator/ecc/template/multiexp.go.tmpl b/internal/generator/ecc/template/multiexp.go.tmpl
index 2f892e0c99..b6cc7aef5e 100644
--- a/internal/generator/ecc/template/multiexp.go.tmpl
+++ b/internal/generator/ecc/template/multiexp.go.tmpl
@@ -49,19 +49,19 @@ func lastC(c uint64) uint64 {
 }
 
 type chunkStat struct {
-	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
+	// relative weight of work compared to other chunks. 100.0 -> nominal weight.
 	weight float32
 
-	// // average absolute deviation. this is meant to give a sense of statistical
+	// // average absolute deviation. this is meant to give a sense of statistical
 	// // dispersion of the scalars[chunk] in the buckets that are hit; (nonZeroBuckets)
-	// deviation int
+	// deviation int
 
-	// percentage of bucket filled in the window;
+	// percentage of bucket filled in the window;
 	ppBucketFilled float32
-	nbBucketFilled int
+	nbBucketFilled int
 
 	// // average ops per non-zero buckets
-	// averageOpsPerBucket int
+	// averageOpsPerBucket int
 
 }
 
@@ -133,15 +133,15 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 
 			// if the digit is larger than 2^{c-1}, then we borrow 2^c from the next window and subtract
 			// 2^{c} from the current digit, making it negative.
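The borrow rule in this comment is implemented by the hunk just below; on a small scalar it is easy to trace by hand. A self-contained sketch (signedDigits is a hypothetical helper; the generated partitionScalars additionally re-encodes each signed digit into a bucket-index/sign bit pattern):

package main

import "fmt"

// signedDigits decomposes a small scalar into base-2^c digits, then remaps
// each digit into the signed range (-2^(c-1), 2^(c-1)] by borrowing 2^c
// from the next window whenever the digit exceeds 2^(c-1).
func signedDigits(scalar uint64, c uint, nbChunks int) []int64 {
	max := int64(1) << (c - 1)
	mask := uint64(1)<<c - 1
	digits := make([]int64, nbChunks)
	carry := int64(0)
	for chunk := 0; chunk < nbChunks; chunk++ {
		digit := carry + int64((scalar>>(uint(chunk)*c))&mask)
		carry = 0
		if digit > max { // borrow 2^c from the next window
			digit -= 1 << c
			carry = 1
		}
		digits[chunk] = digit
	}
	return digits
}

func main() {
	// 246 = 0b1111_0110 with c=4: raw digits are [6, 15];
	// 15 > 8, so it becomes -1 with a carry into the next window,
	// and indeed 6 + (-1)*16 + 1*256 = 246.
	fmt.Println(signedDigits(246, 4, 3)) // prints [6 -1 1]
}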
-			if digit > max {
+			if digit > max {
 				digit -= (1 << c)
 				carry = 1
 			}
 
 			// if digit is zero, no impact on result
 			if digit == 0 {
-				continue
-			}
+				continue
+			}
 
 			var bits uint16
 			if digit > 0 {
@@ -152,7 +152,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 		digits[int(chunk)*len(scalars)+i] = bits
 	}
 
-	// for the last chunk, we don't want to borrow from a next window
+	// for the last chunk, we don't want to borrow from a next window
 	// (but may have a larger max value)
 	chunk := nbChunks - 1
 	s := selectors[chunk]
@@ -168,7 +168,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 		}
 	}, nbTasks)
 
-
+
 	// aggregate chunk stats
 	chunkStats := make([]chunkStat, nbChunks)
 
@@ -179,9 +179,9 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 	parallel.Execute(len(chunkStats), func(start, end int) {
 		// for each chunk compute the statistics
 		for chunkID := start; chunkID < end; chunkID++ {
-			// indicates if a bucket is hit.
-			var b bitSetC16
-
+			// indicates if a bucket is hit.
+			var b bitSetC16
+
 			// digits for the chunk
 			chunkDigits := digits[chunkID*len(scalars):(chunkID+1)*len(scalars)]
 
@@ -189,7 +189,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 			nz := 0 // non zero buckets count
 			for _, digit := range chunkDigits {
 				if digit == 0 {
-					continue
+					continue
 				}
 				totalOps++
 				bucketID := digit >> 1
@@ -219,7 +219,7 @@ func partitionScalars(scalars []fr.Element, c uint64, scalarsMont bool, nbTasks
 			chunkStats[i].weight = (chunkStats[i].weight * 100.0) / target
 		}
 	}
-
+
 	return digits, chunkStats
 }
 
@@ -273,7 +273,7 @@ func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTas
 	// processing in the msm in 2, to ensure all go routines finish at ~same time
 	// /!\ nbTasks is enough as parallel.Execute is not going to spawn more than nbTasks goroutines
 	// if it does, though, this will deadlock.
-	chSmallValues := make(chan int, nbTasks)
+	chSmallValues := make(chan int, nbTasks)
 
 	parallel.Execute(len(scalars), func(start, end int) {
 		smallValues := 0
@@ -313,7 +313,7 @@ func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTas
 
 			// if digit is zero, no impact on result
 			if digit == 0 {
-				continue
+				continue
 			}
 
@@ -342,8 +342,8 @@ func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTas
 		chSmallValues <- smallValues
 
 	}, nbTasks)
-
-
+
+
 	// aggregate small values
 	close(chSmallValues)
 	smallValues := 0
@@ -357,19 +357,19 @@ func partitionScalarsOld(scalars []fr.Element, c uint64, scalarsMont bool, nbTas
 
 // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
+//
 // This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
 func (p *{{ $.TAffine }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TAffine }}, error) {
 	var _p {{$.TJacobian}}
 	if _, err := _p.MultiExp(points, scalars, config); err != nil {
-		return nil, err
+		return nil, err
 	}
 	p.FromJacobian(&_p)
 	return p, nil
 }
 
 // MultiExp implements section 4 of https://eprint.iacr.org/2012/549.pdf
-//
+//
 // This call returns an error if len(scalars) != len(points) or if the provided config is invalid.
 func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Element, config ecc.MultiExpConfig) (*{{ $.TJacobian }}, error) {
 	// note:
@@ -446,7 +446,7 @@ func (p *{{ $.TJacobian }}) MultiExp(points []{{ $.TAffine }}, scalars []fr.Elem
 	nbTasksPostSplit := nbChunksPostSplit*2
 	if (nbTasksPostSplit <= config.NbTasks /2 ) || ( nbTasksPostSplit - config.NbTasks/2 ) <= ( config.NbTasks - nbChunks) {
 		// if postSplit we still have fewer tasks than available CPUs
-		// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
+		// or if we have more tasks BUT the difference of CPU usage is in our favor, we split.
 		config.NbTasks /= 2
 		var _p {{ $.TJacobian }}
 		chDone := make(chan struct{}, 1)
@@ -503,8 +503,8 @@ func _innerMsm{{ $.UPointName }}(p *{{ $.TJacobian }}, c uint64, points []{{ $.T
 				s1.add(&s2)
 				chChunks[chunkID] <- s1
 			}(j)
-			continue
-		}
+			continue
+		}
 		go processChunk(uint64(j), chChunks[j], c, points, digits[j*n:(j+1)*n])
 	}
 
@@ -527,7 +527,7 @@ func getChunkProcessor{{ $.UPointName }}(c uint64, stat chunkStat) func(chunkID
 	{{- else}}
 		const batchSize = {{batchSize $c}}
 		// here we could check some chunk statistic (deviation, ...) to determine if calling
-		// the batch affine version is worth it.
+		// the batch affine version is worth it.
 		if stat.nbBucketFilled < batchSize {
 			// clear indicator that the batch affine method is not appropriate here.
 			return processChunk{{ $.UPointName }}Jacobian[bucket{{ $.TJacobianExtended }}C{{$c}}]
@@ -549,14 +549,12 @@ func msmReduceChunk{{ $.TAffine }}(p *{{ $.TJacobian }}, c int, chChunks []chan
 	_p.Set(&totalj)
 	for j := len(chChunks) - 2; j >= 0; j-- {
 		for l := 0; l < c; l++ {
-			// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 			_p.double(&_p)
 		}
 		totalj := <-chChunks[j]
 		_p.add(&totalj)
 	}
 
-	// TODO @gbotrel / @yelhousni --> what if _p is infinity here?
 	return p.unsafeFromJacExtended(&_p)
 }
 
diff --git a/internal/generator/ecc/template/point.go.tmpl b/internal/generator/ecc/template/point.go.tmpl
index 57e40eb2c4..24046a3b19 100644
--- a/internal/generator/ecc/template/point.go.tmpl
+++ b/internal/generator/ecc/template/point.go.tmpl
@@ -1162,6 +1162,8 @@ func (p *{{ $TJacobianExtended }}) add(q *{{ $TJacobianExtended }}) *{{ $TJacobi
 
 // double point in Jacobian extended coordinates
 // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1
+// since we consider any point on Z=0 as the point at infinity
+// this doubling formula works for infinity points as well
 func (p *{{ $TJacobianExtended }}) double(q *{{ $TJacobianExtended }}) *{{ $TJacobianExtended }} {
 	var U, V, W, S, XX, M {{.CoordType}}
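To see why the comment stamped onto every double() holds: in dbl-2008-s-1 the output coordinates include ZZ3 = V*ZZ1 and ZZZ3 = W*ZZZ1, so an input with ZZ1 = ZZZ1 = 0 necessarily produces another Z=0 representation, and infinity stays infinity. A toy sketch over a small prime field (doubleXYZZ, xyzz, and the modulus are illustrative stand-ins, not the generated fp code):

package main

import "fmt"

// toy prime, standing in for the real fp modulus; small enough that all
// intermediate products below fit in an int64.
const p = 2147483647

type xyzz struct{ X, Y, ZZ, ZZZ int64 }

// doubleXYZZ follows dbl-2008-s-1 (curve coefficient a = 0). Note that the
// two Z outputs are products with the Z inputs, so Z=0 is preserved.
func doubleXYZZ(q xyzz) xyzz {
	mod := func(x int64) int64 { return ((x % p) + p) % p }
	U := mod(2 * q.Y)
	V := mod(U * U)
	W := mod(U * V)
	S := mod(q.X * V)
	XX := mod(q.X * q.X)
	M := mod(3 * XX)
	var r xyzz
	r.X = mod(mod(M*M) - 2*S)
	r.Y = mod(mod(M*(S-r.X)) - mod(W*q.Y))
	r.ZZ = mod(V * q.ZZ)   // zero whenever ZZ1 was zero
	r.ZZZ = mod(W * q.ZZZ) // zero whenever ZZZ1 was zero
	return r
}

func main() {
	inf := xyzz{X: 1, Y: 1}      // one representation of infinity: ZZ = ZZZ = 0
	fmt.Println(doubleXYZZ(inf)) // ZZ and ZZZ stay 0, so the result is still infinity
}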